# 
# packages_to_install <- c('tidyverse', 'ggplot2', 'lme4') # main ones
# install.packages(packages_to_install)
# 
# optional_package<- c('heplots','ggeffects','ggpubr','visreg', 'lattice', 'psych', '')
# install.packages(optional_package)

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk alongside text.

see this book for R introduction aimed at psychologists - https://alexander-pastukhov.github.io/data-analysis-using-r-for-psychology/software.html

Introduction

This is an R Markdown document; it is like a Jupyter notebook or a MATLAB live script. It can incorporate blocks of text together with blocks of R code.

The schedule is to first introduce common data types and variables, followed by functions and libraries for plotting. Afterwards we would apply typical analyses like anovas and mixed effect models to tutorial datasets. We will conclude by showing how to plot results of this data, as well as simulate data for instance for power testing.

1. Data Types and Structures

This can be used as a cheat sheet, but the main thing to remember is that functions act differently depending on data types, so it is important to know what data types are in your data.

Data Types
Data Types
# Basic types (in R even a single value is a length-1 vector)
num <- 42 # numeric (stored as double by default)
char <- "hello" # character string
bool <- TRUE # logical; usable as a condition or for filtering

# Vectors
vec <- c(1, 2, 3, 4) # c() combines values into an atomic vector (one shared type)
vec[1] # R indexing is 1-based: this is the FIRST element
## [1] 1
vec[0]
## numeric(0)
vec[5]
## [1] NA
# Named vector
(vec <- c('height'=2, 'weight'=3, 'gender'=0))
## height weight gender 
##      2      3      0
vec['height']
## height 
##      2
names(vec) <- c('H','W','G')
vec
## H W G 
## 2 3 0
# Lists - can hold different types of variables, e.g., a list of vectors, lists, or dataframes. Can be useful when saving a json file.
lst <- list(name="Boris Johnson", age=54, scores=c(88, 90, 95))

# Matrices - 2D, single type; 1:9 is filled column-wise into 3 rows
mat <- matrix(1:9, nrow=3)

# Data Frames - a table where each column may have a different type
student_data <- data.frame(
  name = c("Ayse", "Ali", "Ece"),
  score = c(90, 85, 88)
)
(vec <- c(1:10)) # wrapping in parentheses assigns AND prints in one step
##  [1]  1  2  3  4  5  6  7  8  9 10
vec[1]
## [1] 1
vec[0]
## integer(0)
vec[20]
## [1] NA
# Named vector
(vec <- c('height'=2, 'weight'=3, 'gender'=0))
## height weight gender 
##      2      3      0
vec['height']
## height 
##      2
names(vec) <- c('H','W','G')
vec
## H W G 
## 2 3 0
vec[2]
## W 
## 3
vec[[2]]
## [1] 3
# read this for more detailed explanation - https://alexander-pastukhov.github.io/data-analysis-using-r-for-psychology/tables.html



(vec <- c(1:10))
##  [1]  1  2  3  4  5  6  7  8  9 10
vec[c(1,2,3)]
## [1] 1 2 3
vec[c(1,1,1,1,1,2,3)]
## [1] 1 1 1 1 1 2 3
vec[c(-1,-2,-3)] # all but 1st 2nd, 3rd 
## [1]  4  5  6  7  8  9 10
5:2
## [1] 5 4 3 2
seq(5,2,by=-1)
## [1] 5 4 3 2
## you can add vectors, but weirdly you don't get an error if they are of different lengths

# Unfortunately, R has a solution and that is not an error or a warning. Rather, if the length of the longer vector is a multiple of the shorter vector length, the shorter vector is “recycled” , i.e., repeated N-times (where N = length (longer vector) / length (shorter vector) )

x <- 1:6
y <- c(2, 3)
print(x + y) # y (length 2) is silently recycled to c(2, 3, 2, 3, 2, 3) because 6 is a multiple of 2
## [1] 3 5 5 7 7 9

Variables can be useful e.g., when we want to iterate or apply a function to each element in a vector or organised data into a data frame. You can perform simple arithmetic operations on variables, which can be useful when you want to do something not available in a package.

# Idiomatic R uses <- for assignment (`=` works here too but is discouraged
# outside of function-call arguments).
a <- 5
b <- 2

print(a + b)
## [1] 7
print(a*b)
## [1] 10
print(paste('a to the power of b',a^b))
## [1] "a to the power of b 25"
print(paste('string one', 'string two')) # our first use of a function, paste can concatenate strings, by default separated by space
## [1] "string one string two"
# note that paste converted the number a^b to a character
# you could often convert simple variables in format, which may be useful later on
print(a)
## [1] 5
print(as.character(a))
## [1] "5"
TEMP <- "51"
print(TEMP)
## [1] "51"
print(as.numeric(TEMP))
## [1] 51
correct <- 1
print(correct)
## [1] 1
print(as.logical(correct))
## [1] TRUE
## you can provide two vectors of the same length to paste and it will concatenate them element-wise
(nth <- paste(1:12)) # outer brackets are a short hand for print - function
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12"
(nth<- paste(1:12, c("st", "nd", "rd", rep("th",9)))) # ok but we do not want space between the two strings, so we can give the function an argument to do this first lets check the help menu ?paste - this would often show arguments for a function, we see something called sep - which controls the separation between strings
##  [1] "1 st"  "2 nd"  "3 rd"  "4 th"  "5 th"  "6 th"  "7 th"  "8 th"  "9 th"  "10 th" "11 th" "12 th"
(nth<- paste(1:12, c("st", "nd", "rd", rep("th",9)), sep="")) # in R sometimes there are short hands for commonly used functions, e.g., paste0 takes the default value of sep="", so
##  [1] "1st"  "2nd"  "3rd"  "4th"  "5th"  "6th"  "7th"  "8th"  "9th"  "10th" "11th" "12th"
(nth<- paste0(1:12, c("st", "nd", "rd", rep("th",9))))
##  [1] "1st"  "2nd"  "3rd"  "4th"  "5th"  "6th"  "7th"  "8th"  "9th"  "10th" "11th" "12th"

Working with strings can be particularly useful when we want to deal with file names, but before I show that I want to mention some simple functions that can be particularly helpful when combining variables together We already saw one of these functions - rep

(rep(1,2)) # this can repeat a variable multiple times, so for example if we had a vector of 1s (males) and 0s (females) we could repeat it to match our data. Lets say we have data from 10 males and 10 females
## [1] 1 1
(rep(c(1,0),10)) # a but this repeated the whole vector 10 times, maybe we have first the 10  males and later 10 females
##  [1] 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0
(rep(c(1,0),each=10)) # ok and what if now we collect another 20 people in same order
##  [1] 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
(temp_gender <- rep(c(1,0),each=10,times=2)) 
##  [1] 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
# rep works also with factors or strings
gender <-c("male","female")
(rep(gender, each=10))
##  [1] "male"   "male"   "male"   "male"   "male"   "male"   "male"   "male"   "male"   "male"   "female" "female" "female"
## [14] "female" "female" "female" "female" "female" "female" "female"
gender_factor <-factor(temp_gender, levels = c(1,0), labels = c("male","female"))

summary(gender_factor)
##   male female 
##     20     20
unclass(gender_factor)
##  [1] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2
## attr(,"levels")
## [1] "male"   "female"

Now that we have a vector temp_gender we may want to manipulate it. We can change it with maths or with logical indexing - cleaning and organising the data is often the most time-consuming part of research. I will first show the most common operations with base R since that would be most similar to other programming languages like Python and MATLAB. I will then show some shorthands that can make things a bit easier to code but achieve the same result. Because we have a vector we can either apply an operation to every element in the vector, or - what we most often want - perform one operation on some elements and another operation on other elements. For instance, we may want to recode our variable to be -1 for females (the 0 codes) and 1 for males (the 1 codes).

temp_gender_new <- numeric(length(temp_gender)) # preallocate the output vector (filled with zeros), same length as the input

# we can iterate over a sequence of numbers
for (i in 1:5 ) {
  print(i)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
# we can add a condition so something runs only when it is met - here when i is odd rather than even
# (i %% 2 is the remainder after division by 2; non-zero numbers are coerced to TRUE)
for (i in 1:5 ) {
  if (i %% 2){
    print(i)
  }
}
## [1] 1
## [1] 3
## [1] 5
# Recode temp_gender element-wise: 0 becomes -1, everything else becomes 1
for (idx in seq_along(temp_gender)) {
  # `if` used as an expression: evaluates to -1 for the 0 codes, +1 otherwise
  temp_gender_new[idx] <- if (temp_gender[idx] == 0) -1 else 1
}

(rbind(temp_gender,temp_gender_new)) # stack rows
##                 [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19]
## temp_gender        1    1    1    1    1    1    1    1    1     1     0     0     0     0     0     0     0     0     0
## temp_gender_new    1    1    1    1    1    1    1    1    1     1    -1    -1    -1    -1    -1    -1    -1    -1    -1
##                 [,20] [,21] [,22] [,23] [,24] [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36] [,37]
## temp_gender         0     1     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0     0
## temp_gender_new    -1     1     1     1     1     1     1     1     1     1     1    -1    -1    -1    -1    -1    -1    -1
##                 [,38] [,39] [,40]
## temp_gender         0     0     0
## temp_gender_new    -1    -1    -1
### shorthand: ifelse() is the vectorized equivalent of the loop above, and this recode is common enough to want one
temp_gender_new <- ifelse(temp_gender == 0, -1 , 1) # where temp_gender equals 0 assign -1, otherwise assign 1

# although we did this with for loops and if conditions, for this specific situation the easiest may be a simple arithmetic solution - multiplication by a scalar performs the operation on each element
temp_gender_recode <- temp_gender*2 - 1 # maps 0 -> -1 and 1 -> 1
(rbind(temp_gender, temp_gender_new, temp_gender_recode)) # stack the three vectors as rows to compare them
##                    [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19]
## temp_gender           1    1    1    1    1    1    1    1    1     1     0     0     0     0     0     0     0     0     0
## temp_gender_new       1    1    1    1    1    1    1    1    1     1    -1    -1    -1    -1    -1    -1    -1    -1    -1
## temp_gender_recode    1    1    1    1    1    1    1    1    1     1    -1    -1    -1    -1    -1    -1    -1    -1    -1
##                    [,20] [,21] [,22] [,23] [,24] [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36]
## temp_gender            0     1     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0
## temp_gender_new       -1     1     1     1     1     1     1     1     1     1     1    -1    -1    -1    -1    -1    -1
## temp_gender_recode    -1     1     1     1     1     1     1     1     1     1     1    -1    -1    -1    -1    -1    -1
##                    [,37] [,38] [,39] [,40]
## temp_gender            0     0     0     0
## temp_gender_new       -1    -1    -1    -1
## temp_gender_recode    -1    -1    -1    -1

As you can see there are multiple ways to achieve the same goal; this is very common in programming, especially in open software where different people might have created different packages that do basically the same thing. It can be tricky as sometimes the packages appear to do the same things but there are subtle differences. Often this is easier to detect when re-formatting data, but it may be more subtle when working with statistical packages, e.g., ones that calculate effect sizes or marginal means. But for now back on task: we now know the basics of programming that we would need in most psychology situations. We know how to iterate over things and provide conditions; what we do not know yet is how to write our own functions.

# often there would be some pre-made function that does what you need, but sometimes it may be useful to use a function to wrap around a piece of code you use often
# here I simply show the syntax of a very simple function; later in the workshop we will use a more useful one.
# main points are we can define something we pass it arguments, (x,y), you make some computation and return the output.
add_two_numbers <- function(x=1, y=1) {
  # Return the sum of the two arguments; both default to 1, so the
  # function can be called with no input at all.
  x + y
}
print(add_two_numbers())
## [1] 2
print(add_two_numbers(x=5, y =10))
## [1] 15

Ok, after seeing some basics of programming, it may be a good time to actually start doing some things we might encounter in the real world. In my opinion, doing something is the best way to learn; you should not be too scared of making mistakes, and the syntax is something the internet or ChatGPT can help you with. There are extensive resources online introducing R and showing how to use different data types, vectors and functions; I have listed some of them here.

Most famous ones are the Navarro R book - https://learningstatisticswithr.com/ For people with no experience with R - https://bookdown.org/rdpeng/rprogdatascience/

https://intro2r.com/ https://www.w3schools.com/r/r_intro.asp https://r4ds.had.co.nz/index.html https://rpubs.com/ruruu127/417821 https://alexander-pastukhov.github.io/data-analysis-using-r-for-psychology/index.html - Aimed at Psychologists

tidyverse is a package in R that has gained popularity; it is effectively a shorthand way to achieve what you can also do with base R using more code. Some people love it, some people hate it. My opinion is that base R is more similar to other languages so it may be easier to read. The "speed" you gain from running things in tidyverse is unlikely to be important for most psychologists, and it can act a bit too much as a black box when you are learning things. However, it can sometimes be easier for plotting and data-wrangling. I will try to focus on base R in this tutorial but will try and include tidyverse alternatives to give an introduction to it as well. One thing to note is that many packages may have functions or operators (e.g., *, %>%) that are named the same so they can interact. This means when using multiple packages one has to be slightly careful and expect occasional errors.

https://www.tidyverse.org/learn/ https://bookdown.org/yih_huynh/Guide-to-R-Book/tidyverse.html

However, reading too much into “theory” may not be useful, as quite likely one will forget exactly what command performed a certain operation unless they do it repeatedly.

# Number of subjects
n_subjects <- 20

# Create one folder per subject, each holding a small demographics CSV
# (sub-01/sub-01.csv, sub-02/sub-02.csv, ...).
# seq_len() is safer than 1:n_subjects - it yields an empty sequence
# (instead of c(1, 0)) when n_subjects happens to be 0.
for (i in seq_len(n_subjects)) {

  # Zero-padded subject ID like "sub-01"
  sub_id <- sprintf("sub-%02d", i)

  # Create the subject folder; suppress the warning if it already exists
  dir.create(sub_id, showWarnings = FALSE)

  # Simulate a one-row demographics record
  # NOTE: sample() is random - call set.seed() beforehand if you need
  # the generated files to be reproducible
  demo_data <- data.frame(
    subject = sub_id,
    age = sample(18:65, 1),
    condition = sample(c("control", "treatment"), 1)
  )

  # Platform-independent file path: sub-01/sub-01.csv
  file_path <- file.path(sub_id, paste0(sub_id, ".csv"))

  # Write the CSV without the row-name column
  write.csv(demo_data, file = file_path, row.names = FALSE)
}
main_dir <- (".")

subject_folders <- list.dirs(main_dir, full.names = TRUE, recursive = FALSE) # every immediate sub-directory of main_dir (non-recursive)
# gsub (like grep) accepts regular expressions
folder_list <- gsub("\\./", "",subject_folders) # strip the leading "./" prefix; "\\." escapes the literal dot
## alternatively, filter by name while listing
(subject_folders <- list.files(main_dir, full.names = TRUE, recursive = FALSE, pattern="sub")) # pattern keeps only entries whose name contains "sub"
##  [1] "./sub-01" "./sub-02" "./sub-03" "./sub-04" "./sub-05" "./sub-06" "./sub-07" "./sub-08" "./sub-09" "./sub-10" "./sub-11"
## [12] "./sub-12" "./sub-13" "./sub-14" "./sub-15" "./sub-16" "./sub-17" "./sub-18" "./sub-19" "./sub-20"
all_data_list <- list()


# Walk every subject folder, read its per-subject CSV, and collect the
# resulting data frames in a named list.
for (subj_dir in subject_folders) {

  # CSVs whose name matches sub-XX.csv exactly ("\\d{2}" = two digits)
  matched_csvs <- list.files(subj_dir, pattern ="^sub-\\d{2}\\.csv$", full.names = TRUE)

  if (length(matched_csvs) == 0) {
    # No matching file - report which folder was empty and move on
    warning(paste("No CSV found in", subj_dir))
  } else {
    # Take the first match as the subject's file and read it
    df <- read.csv(matched_csvs[1])

    # (Re)derive the subject column from the folder name, e.g. "sub-01"
    df$subject <- basename(subj_dir)

    # Store under the folder name so rows stay traceable after binding
    all_data_list[[basename(subj_dir)]] <- df
  }
}

# Stack all per-subject data frames into a single one (row names come from the list names)
combined_data <- do.call(rbind, all_data_list)

# Show result
head(combined_data)
##        subject age condition
## sub-01  sub-01  61 treatment
## sub-02  sub-02  23   control
## sub-03  sub-03  65   control
## sub-04  sub-04  32   control
## sub-05  sub-05  57   control
## sub-06  sub-06  32 treatment
Reg Ex cheat sheet
Reg Ex cheat sheet

Can we achieve the same with less code?

library(tidyverse)
## ── Attaching core tidyverse packages ────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ✔ readr     1.4.0     
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ tidyr::pack()   masks Matrix::pack()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
main_dir <- "."

# Get all subject folders whose name starts with "sub-" followed by digits
subject_dirs <- list.dirs(main_dir, recursive = FALSE) %>%
  keep(~ str_detect(basename(.x), "^sub-\\d+"))

# temp_name <- list.dirs(main_dir, recursive = FALSE) 
# keep(temp_name, ~ str_detect(basename(.x), "^sub-\\d+"))

# Read and combine the first .csv file from each subject folder;
# map_dfr() applies the anonymous function to each folder and row-binds the results
combined_data <- subject_dirs %>%
  map_dfr(function(folder) {
    csv_file <- list.files(folder, pattern = "\\.csv$", full.names = TRUE)[1]
    # [1] yields NA when the folder has no CSV; in that case the function
    # returns NULL (no else branch), which map_dfr silently drops
    if (!is.na(csv_file) && file.exists(csv_file)) {
      read_csv(csv_file) %>%
        mutate(subject = basename(folder))
    }
  })
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
##   subject = col_character(),
##   age = col_double(),
##   condition = col_character()
## )
## 
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
##   subject = col_character(),
##   age = col_double(),
##   condition = col_character()
## )
## 
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
##   subject = col_character(),
##   age = col_double(),
##   condition = col_character()
## )
## 
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
##   subject = col_character(),
##   age = col_double(),
##   condition = col_character()
## )
## 
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
##   subject = col_character(),
##   age = col_double(),
##   condition = col_character()
## )
## 
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
##   subject = col_character(),
##   age = col_double(),
##   condition = col_character()
## )
## 
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
##   subject = col_character(),
##   age = col_double(),
##   condition = col_character()
## )
## 
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
##   subject = col_character(),
##   age = col_double(),
##   condition = col_character()
## )
## 
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
##   subject = col_character(),
##   age = col_double(),
##   condition = col_character()
## )
## 
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
##   subject = col_character(),
##   age = col_double(),
##   condition = col_character()
## )
## 
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
##   subject = col_character(),
##   age = col_double(),
##   condition = col_character()
## )
## 
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
##   subject = col_character(),
##   age = col_double(),
##   condition = col_character()
## )
## 
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
##   subject = col_character(),
##   age = col_double(),
##   condition = col_character()
## )
## 
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
##   subject = col_character(),
##   age = col_double(),
##   condition = col_character()
## )
## 
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
##   subject = col_character(),
##   age = col_double(),
##   condition = col_character()
## )
## 
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
##   subject = col_character(),
##   age = col_double(),
##   condition = col_character()
## )
## 
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
##   subject = col_character(),
##   age = col_double(),
##   condition = col_character()
## )
## 
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
##   subject = col_character(),
##   age = col_double(),
##   condition = col_character()
## )
## 
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
##   subject = col_character(),
##   age = col_double(),
##   condition = col_character()
## )
## 
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
##   subject = col_character(),
##   age = col_double(),
##   condition = col_character()
## )
head(combined_data)
## # A tibble: 6 × 3
##   subject   age condition
##   <chr>   <dbl> <chr>    
## 1 sub-01     61 treatment
## 2 sub-02     23 control  
## 3 sub-03     65 control  
## 4 sub-04     32 control  
## 5 sub-05     57 control  
## 6 sub-06     32 treatment

We achieved the same goal with less code, but there are a lot of things going on that may be difficult to follow so I will provide a bit more general introduction to tidyverse here The basic idea of the package is to apply functions to elements of data.frames more easily and automatically, without us having to do the for loops Lets re-start a new data so we can show some of the functionality of tidyverse - which is a large collection of packages most common being ggplot and dplyr - they can be installed separately if you have issues with install.packages(“tidyverse”)

We will demonstrate some of the main features and plotting using the iris dataset iris is a tidy dataset. Each row is an observation of an individual iris, each column is a different variable. This can be thought of as long form data (foreshadowing) - Basic idea is that each row might be a trial from experimental paradigm, or a subject, and each column could be a variable, e.g., Condition, Reaction Time, Age, Gender, etc. This format of storing data is very common throughout statistics and is not really specific to R

— see readxl for reading excel data e.g., read_excel(“my_excel_file.xls”, sheet=2) / read_excel(“my_excel_file.xls”, sheet=“addendum”) — saveRDS and readRDS can be used when you want to save a vector or R object that is not a csv

print(dim(iris))
## [1] 150   5
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
print(class(iris))
## [1] "data.frame"
colnames(iris)
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"
glimpse(iris)
## Rows: 150
## Columns: 5
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9, 5.4, 4.8, 4.8, 4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1, 5.4…
## $ Sepal.Width  <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.4, 3.0, 3.0, 4.0, 4.4, 3.9, 3.5, 3.8, 3.8, 3.4…
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5, 1.6, 1.4, 1.1, 1.2, 1.5, 1.3, 1.4, 1.7, 1.5, 1.7…
## $ Petal.Width  <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2, 0.2, 0.1, 0.1, 0.2, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2…
## $ Species      <fct> setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa,…
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width          Species  
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100   setosa    :50  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300   versicolor:50  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300   virginica :50  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199                  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800                  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500
# this is a cheat to call a function from a package without loading the full package
psych::describe(iris)
##              vars   n mean   sd median trimmed  mad min max range  skew kurtosis   se
## Sepal.Length    1 150 5.84 0.83   5.80    5.81 1.04 4.3 7.9   3.6  0.31    -0.61 0.07
## Sepal.Width     2 150 3.06 0.44   3.00    3.04 0.44 2.0 4.4   2.4  0.31     0.14 0.04
## Petal.Length    3 150 3.76 1.77   4.35    3.76 1.85 1.0 6.9   5.9 -0.27    -1.42 0.14
## Petal.Width     4 150 1.20 0.76   1.30    1.18 1.04 0.1 2.5   2.4 -0.10    -1.36 0.06
## Species*        5 150 2.00 0.82   2.00    2.00 1.48 1.0 3.0   2.0  0.00    -1.52 0.07

Conditional operators -

== equal

<= less or equal

>= more or equal

& - and

| - or

# One can grab data from specific rows or columns but simply indexing the data e.g., 2nd row 3rd column
iris[2,3]
## [1] 1.4
iris$Sepal.Length # grab whole column
##   [1] 5.1 4.9 4.7 4.6 5.0 5.4 4.6 5.0 4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4 5.1 5.7 5.1 5.4 5.1 4.6 5.1 4.8 5.0 5.0 5.2 5.2 4.7
##  [31] 4.8 5.4 5.2 5.5 4.9 5.0 5.5 4.9 4.4 5.1 5.0 4.5 4.4 5.0 5.1 4.8 5.1 4.6 5.3 5.0 7.0 6.4 6.9 5.5 6.5 5.7 6.3 4.9 6.6 5.2
##  [61] 5.0 5.9 6.0 6.1 5.6 6.7 5.6 5.8 6.2 5.6 5.9 6.1 6.3 6.1 6.4 6.6 6.8 6.7 6.0 5.7 5.5 5.5 5.8 6.0 5.4 6.0 6.7 6.3 5.6 5.5
##  [91] 5.5 6.1 5.8 5.0 5.6 5.7 5.7 6.2 5.1 5.7 6.3 5.8 7.1 6.3 6.5 7.6 4.9 7.3 6.7 7.2 6.5 6.4 6.8 5.7 5.8 6.4 6.5 7.7 7.7 6.0
## [121] 6.9 5.6 7.7 6.3 6.7 7.2 6.2 6.1 6.4 7.2 7.4 7.9 6.4 6.3 6.1 7.7 6.3 6.4 6.0 6.9 6.7 6.9 5.8 6.8 6.7 6.7 6.3 6.5 6.2 5.9
iris['Sepal.Length']
##     Sepal.Length
## 1            5.1
## 2            4.9
## 3            4.7
## 4            4.6
## 5            5.0
## 6            5.4
## 7            4.6
## 8            5.0
## 9            4.4
## 10           4.9
## 11           5.4
## 12           4.8
## 13           4.8
## 14           4.3
## 15           5.8
## 16           5.7
## 17           5.4
## 18           5.1
## 19           5.7
## 20           5.1
## 21           5.4
## 22           5.1
## 23           4.6
## 24           5.1
## 25           4.8
## 26           5.0
## 27           5.0
## 28           5.2
## 29           5.2
## 30           4.7
## 31           4.8
## 32           5.4
## 33           5.2
## 34           5.5
## 35           4.9
## 36           5.0
## 37           5.5
## 38           4.9
## 39           4.4
## 40           5.1
## 41           5.0
## 42           4.5
## 43           4.4
## 44           5.0
## 45           5.1
## 46           4.8
## 47           5.1
## 48           4.6
## 49           5.3
## 50           5.0
## 51           7.0
## 52           6.4
## 53           6.9
## 54           5.5
## 55           6.5
## 56           5.7
## 57           6.3
## 58           4.9
## 59           6.6
## 60           5.2
## 61           5.0
## 62           5.9
## 63           6.0
## 64           6.1
## 65           5.6
## 66           6.7
## 67           5.6
## 68           5.8
## 69           6.2
## 70           5.6
## 71           5.9
## 72           6.1
## 73           6.3
## 74           6.1
## 75           6.4
## 76           6.6
## 77           6.8
## 78           6.7
## 79           6.0
## 80           5.7
## 81           5.5
## 82           5.5
## 83           5.8
## 84           6.0
## 85           5.4
## 86           6.0
## 87           6.7
## 88           6.3
## 89           5.6
## 90           5.5
## 91           5.5
## 92           6.1
## 93           5.8
## 94           5.0
## 95           5.6
## 96           5.7
## 97           5.7
## 98           6.2
## 99           5.1
## 100          5.7
## 101          6.3
## 102          5.8
## 103          7.1
## 104          6.3
## 105          6.5
## 106          7.6
## 107          4.9
## 108          7.3
## 109          6.7
## 110          7.2
## 111          6.5
## 112          6.4
## 113          6.8
## 114          5.7
## 115          5.8
## 116          6.4
## 117          6.5
## 118          7.7
## 119          7.7
## 120          6.0
## 121          6.9
## 122          5.6
## 123          7.7
## 124          6.3
## 125          6.7
## 126          7.2
## 127          6.2
## 128          6.1
## 129          6.4
## 130          7.2
## 131          7.4
## 132          7.9
## 133          6.4
## 134          6.3
## 135          6.1
## 136          7.7
## 137          6.3
## 138          6.4
## 139          6.0
## 140          6.9
## 141          6.7
## 142          6.9
## 143          5.8
## 144          6.8
## 145          6.7
## 146          6.7
## 147          6.3
## 148          6.5
## 149          6.2
## 150          5.9
iris[['Sepal.Length']]
##   [1] 5.1 4.9 4.7 4.6 5.0 5.4 4.6 5.0 4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4 5.1 5.7 5.1 5.4 5.1 4.6 5.1 4.8 5.0 5.0 5.2 5.2 4.7
##  [31] 4.8 5.4 5.2 5.5 4.9 5.0 5.5 4.9 4.4 5.1 5.0 4.5 4.4 5.0 5.1 4.8 5.1 4.6 5.3 5.0 7.0 6.4 6.9 5.5 6.5 5.7 6.3 4.9 6.6 5.2
##  [61] 5.0 5.9 6.0 6.1 5.6 6.7 5.6 5.8 6.2 5.6 5.9 6.1 6.3 6.1 6.4 6.6 6.8 6.7 6.0 5.7 5.5 5.5 5.8 6.0 5.4 6.0 6.7 6.3 5.6 5.5
##  [91] 5.5 6.1 5.8 5.0 5.6 5.7 5.7 6.2 5.1 5.7 6.3 5.8 7.1 6.3 6.5 7.6 4.9 7.3 6.7 7.2 6.5 6.4 6.8 5.7 5.8 6.4 6.5 7.7 7.7 6.0
## [121] 6.9 5.6 7.7 6.3 6.7 7.2 6.2 6.1 6.4 7.2 7.4 7.9 6.4 6.3 6.1 7.7 6.3 6.4 6.0 6.9 6.7 6.9 5.8 6.8 6.7 6.7 6.3 6.5 6.2 5.9
(iris[,1]) # grab whole column by numeric index; wrapping in ( ) also prints the result
##   [1] 5.1 4.9 4.7 4.6 5.0 5.4 4.6 5.0 4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4 5.1 5.7 5.1 5.4 5.1 4.6 5.1 4.8 5.0 5.0 5.2 5.2 4.7
##  [31] 4.8 5.4 5.2 5.5 4.9 5.0 5.5 4.9 4.4 5.1 5.0 4.5 4.4 5.0 5.1 4.8 5.1 4.6 5.3 5.0 7.0 6.4 6.9 5.5 6.5 5.7 6.3 4.9 6.6 5.2
##  [61] 5.0 5.9 6.0 6.1 5.6 6.7 5.6 5.8 6.2 5.6 5.9 6.1 6.3 6.1 6.4 6.6 6.8 6.7 6.0 5.7 5.5 5.5 5.8 6.0 5.4 6.0 6.7 6.3 5.6 5.5
##  [91] 5.5 6.1 5.8 5.0 5.6 5.7 5.7 6.2 5.1 5.7 6.3 5.8 7.1 6.3 6.5 7.6 4.9 7.3 6.7 7.2 6.5 6.4 6.8 5.7 5.8 6.4 6.5 7.7 7.7 6.0
## [121] 6.9 5.6 7.7 6.3 6.7 7.2 6.2 6.1 6.4 7.2 7.4 7.9 6.4 6.3 6.1 7.7 6.3 6.4 6.0 6.9 6.7 6.9 5.8 6.8 6.7 6.7 6.3 6.5 6.2 5.9
head(iris[,-1]) # a negative index drops columns: grab all columns except the first one
##   Sepal.Width Petal.Length Petal.Width Species
## 1         3.5          1.4         0.2  setosa
## 2         3.0          1.4         0.2  setosa
## 3         3.2          1.3         0.2  setosa
## 4         3.1          1.5         0.2  setosa
## 5         3.6          1.4         0.2  setosa
## 6         3.9          1.7         0.4  setosa
iris[iris$Species == "setosa", ] # logical row indexing: keep all columns, but only rows from the setosa species
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.1         3.5          1.4         0.2  setosa
## 2           4.9         3.0          1.4         0.2  setosa
## 3           4.7         3.2          1.3         0.2  setosa
## 4           4.6         3.1          1.5         0.2  setosa
## 5           5.0         3.6          1.4         0.2  setosa
## 6           5.4         3.9          1.7         0.4  setosa
## 7           4.6         3.4          1.4         0.3  setosa
## 8           5.0         3.4          1.5         0.2  setosa
## 9           4.4         2.9          1.4         0.2  setosa
## 10          4.9         3.1          1.5         0.1  setosa
## 11          5.4         3.7          1.5         0.2  setosa
## 12          4.8         3.4          1.6         0.2  setosa
## 13          4.8         3.0          1.4         0.1  setosa
## 14          4.3         3.0          1.1         0.1  setosa
## 15          5.8         4.0          1.2         0.2  setosa
## 16          5.7         4.4          1.5         0.4  setosa
## 17          5.4         3.9          1.3         0.4  setosa
## 18          5.1         3.5          1.4         0.3  setosa
## 19          5.7         3.8          1.7         0.3  setosa
## 20          5.1         3.8          1.5         0.3  setosa
## 21          5.4         3.4          1.7         0.2  setosa
## 22          5.1         3.7          1.5         0.4  setosa
## 23          4.6         3.6          1.0         0.2  setosa
## 24          5.1         3.3          1.7         0.5  setosa
## 25          4.8         3.4          1.9         0.2  setosa
## 26          5.0         3.0          1.6         0.2  setosa
## 27          5.0         3.4          1.6         0.4  setosa
## 28          5.2         3.5          1.5         0.2  setosa
## 29          5.2         3.4          1.4         0.2  setosa
## 30          4.7         3.2          1.6         0.2  setosa
## 31          4.8         3.1          1.6         0.2  setosa
## 32          5.4         3.4          1.5         0.4  setosa
## 33          5.2         4.1          1.5         0.1  setosa
## 34          5.5         4.2          1.4         0.2  setosa
## 35          4.9         3.1          1.5         0.2  setosa
## 36          5.0         3.2          1.2         0.2  setosa
## 37          5.5         3.5          1.3         0.2  setosa
## 38          4.9         3.6          1.4         0.1  setosa
## 39          4.4         3.0          1.3         0.2  setosa
## 40          5.1         3.4          1.5         0.2  setosa
## 41          5.0         3.5          1.3         0.3  setosa
## 42          4.5         2.3          1.3         0.3  setosa
## 43          4.4         3.2          1.3         0.2  setosa
## 44          5.0         3.5          1.6         0.6  setosa
## 45          5.1         3.8          1.9         0.4  setosa
## 46          4.8         3.0          1.4         0.3  setosa
## 47          5.1         3.8          1.6         0.2  setosa
## 48          4.6         3.2          1.4         0.2  setosa
## 49          5.3         3.7          1.5         0.2  setosa
## 50          5.0         3.3          1.4         0.2  setosa
# Conditions can be combined with | (OR): rows that are either setosa or virginica
(iris[iris$Species == "setosa" | iris$Species == "virginica", ])
##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 1            5.1         3.5          1.4         0.2    setosa
## 2            4.9         3.0          1.4         0.2    setosa
## 3            4.7         3.2          1.3         0.2    setosa
## 4            4.6         3.1          1.5         0.2    setosa
## 5            5.0         3.6          1.4         0.2    setosa
## 6            5.4         3.9          1.7         0.4    setosa
## 7            4.6         3.4          1.4         0.3    setosa
## 8            5.0         3.4          1.5         0.2    setosa
## 9            4.4         2.9          1.4         0.2    setosa
## 10           4.9         3.1          1.5         0.1    setosa
## 11           5.4         3.7          1.5         0.2    setosa
## 12           4.8         3.4          1.6         0.2    setosa
## 13           4.8         3.0          1.4         0.1    setosa
## 14           4.3         3.0          1.1         0.1    setosa
## 15           5.8         4.0          1.2         0.2    setosa
## 16           5.7         4.4          1.5         0.4    setosa
## 17           5.4         3.9          1.3         0.4    setosa
## 18           5.1         3.5          1.4         0.3    setosa
## 19           5.7         3.8          1.7         0.3    setosa
## 20           5.1         3.8          1.5         0.3    setosa
## 21           5.4         3.4          1.7         0.2    setosa
## 22           5.1         3.7          1.5         0.4    setosa
## 23           4.6         3.6          1.0         0.2    setosa
## 24           5.1         3.3          1.7         0.5    setosa
## 25           4.8         3.4          1.9         0.2    setosa
## 26           5.0         3.0          1.6         0.2    setosa
## 27           5.0         3.4          1.6         0.4    setosa
## 28           5.2         3.5          1.5         0.2    setosa
## 29           5.2         3.4          1.4         0.2    setosa
## 30           4.7         3.2          1.6         0.2    setosa
## 31           4.8         3.1          1.6         0.2    setosa
## 32           5.4         3.4          1.5         0.4    setosa
## 33           5.2         4.1          1.5         0.1    setosa
## 34           5.5         4.2          1.4         0.2    setosa
## 35           4.9         3.1          1.5         0.2    setosa
## 36           5.0         3.2          1.2         0.2    setosa
## 37           5.5         3.5          1.3         0.2    setosa
## 38           4.9         3.6          1.4         0.1    setosa
## 39           4.4         3.0          1.3         0.2    setosa
## 40           5.1         3.4          1.5         0.2    setosa
## 41           5.0         3.5          1.3         0.3    setosa
## 42           4.5         2.3          1.3         0.3    setosa
## 43           4.4         3.2          1.3         0.2    setosa
## 44           5.0         3.5          1.6         0.6    setosa
## 45           5.1         3.8          1.9         0.4    setosa
## 46           4.8         3.0          1.4         0.3    setosa
## 47           5.1         3.8          1.6         0.2    setosa
## 48           4.6         3.2          1.4         0.2    setosa
## 49           5.3         3.7          1.5         0.2    setosa
## 50           5.0         3.3          1.4         0.2    setosa
## 101          6.3         3.3          6.0         2.5 virginica
## 102          5.8         2.7          5.1         1.9 virginica
## 103          7.1         3.0          5.9         2.1 virginica
## 104          6.3         2.9          5.6         1.8 virginica
## 105          6.5         3.0          5.8         2.2 virginica
## 106          7.6         3.0          6.6         2.1 virginica
## 107          4.9         2.5          4.5         1.7 virginica
## 108          7.3         2.9          6.3         1.8 virginica
## 109          6.7         2.5          5.8         1.8 virginica
## 110          7.2         3.6          6.1         2.5 virginica
## 111          6.5         3.2          5.1         2.0 virginica
## 112          6.4         2.7          5.3         1.9 virginica
## 113          6.8         3.0          5.5         2.1 virginica
## 114          5.7         2.5          5.0         2.0 virginica
## 115          5.8         2.8          5.1         2.4 virginica
## 116          6.4         3.2          5.3         2.3 virginica
## 117          6.5         3.0          5.5         1.8 virginica
## 118          7.7         3.8          6.7         2.2 virginica
## 119          7.7         2.6          6.9         2.3 virginica
## 120          6.0         2.2          5.0         1.5 virginica
## 121          6.9         3.2          5.7         2.3 virginica
## 122          5.6         2.8          4.9         2.0 virginica
## 123          7.7         2.8          6.7         2.0 virginica
## 124          6.3         2.7          4.9         1.8 virginica
## 125          6.7         3.3          5.7         2.1 virginica
## 126          7.2         3.2          6.0         1.8 virginica
## 127          6.2         2.8          4.8         1.8 virginica
## 128          6.1         3.0          4.9         1.8 virginica
## 129          6.4         2.8          5.6         2.1 virginica
## 130          7.2         3.0          5.8         1.6 virginica
## 131          7.4         2.8          6.1         1.9 virginica
## 132          7.9         3.8          6.4         2.0 virginica
## 133          6.4         2.8          5.6         2.2 virginica
## 134          6.3         2.8          5.1         1.5 virginica
## 135          6.1         2.6          5.6         1.4 virginica
## 136          7.7         3.0          6.1         2.3 virginica
## 137          6.3         3.4          5.6         2.4 virginica
## 138          6.4         3.1          5.5         1.8 virginica
## 139          6.0         3.0          4.8         1.8 virginica
## 140          6.9         3.1          5.4         2.1 virginica
## 141          6.7         3.1          5.6         2.4 virginica
## 142          6.9         3.1          5.1         2.3 virginica
## 143          5.8         2.7          5.1         1.9 virginica
## 144          6.8         3.2          5.9         2.3 virginica
## 145          6.7         3.3          5.7         2.5 virginica
## 146          6.7         3.0          5.2         2.3 virginica
## 147          6.3         2.5          5.0         1.9 virginica
## 148          6.5         3.0          5.2         2.0 virginica
## 149          6.2         3.4          5.4         2.3 virginica
## 150          5.9         3.0          5.1         1.8 virginica
# Often we will have some missing values that we have to find
iris_copy <- iris
make_missing <- sample(c(1,0),size = nrow(iris_copy), prob = c(0.10,0.90), replace = TRUE) # samples 1 and 0 with replacement; here 1 has a 10% chance of occurring and 0 a 90% chance (prob pairs up with c(1,0))
iris_copy$Sepal.Length[as.logical(make_missing)] <- NA # the ~10% of rows that drew a 1 become NA
(missing_values <- is.na(iris_copy$Sepal.Length)) # logical vector flagging the missing entries
##   [1] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [21] FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [41] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [81]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [101] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE
## [141] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
print(sum(missing_values)/nrow(iris_copy)) # proportion of missing values (should be close to 0.10)
## [1] 0.09333333
# ok what is the mean length
mean(iris$Sepal.Length, na.rm = TRUE) # na.rm = TRUE means ignore NA values when computing the mean
## [1] 5.843333
# mean restricted to one species via logical indexing of the column vector
mean(iris$Sepal.Length[iris$Species == "setosa"], na.rm = TRUE)
## [1] 5.006
# same idea, but selecting rows and a column inside the data.frame itself
mean(iris[iris$Species == "setosa", "Petal.Length"], na.rm = TRUE)
## [1] 1.462
## ok but we can also achieve this with piping which is very common in the tidyverse
# piping is a common programming trick where we feed the result of one operation into another
iris$Sepal.Length %>% mean() # equivalent to mean(iris$Sepal.Length)
## [1] 5.843333
# This may look pointless but it can be quite helpful when you want to perform multiple functions together
# lets say we have repeats - we may want to calculate the mean of unique values

mean(unique(iris$Sepal.Length)) # nested calls have to be read inside-out
## [1] 6.011429
iris$Sepal.Length %>% unique() %>% mean() %>% round(digits=3) # the piped version reads left-to-right
## [1] 6.011
## %>%  - can be done with ctrl + shift + m / or command + shift + m

# z <- 4
# w <- 3
# z %>% radius(w) # is the same as radius(z,w) - piped output is first argument
# z %>% radius(x = w, y = .) # is radius(w,z) - you can use the dot to specify place for piped output
## remember this

## subject_dirs <- list.dirs(main_dir, recursive = FALSE) %>%
##  keep(~ str_detect(basename(.x), "^sub-\\d+"))

# we are feeding the results of list.dirs the subject folders to another function that will apply operations to each element, but we are getting ahead of ourselves

Let's do some data wrangling - see https://rpubs.com/Earlien/Wrangling. dplyr is a package that is part of the larger tidyverse. dplyr is a grammar of data manipulation, providing a consistent set of verbs that help you solve the most common data manipulation challenges - https://dplyr.tidyverse.org/. As you can imagine, base R can do everything that the tidyverse can, but it can become a bit hairy for the basic data operations that are most common.

library(dplyr)
# lets say we may want to select only certain columns - select is a good way to do this

iris_select <- iris %>% select(Species, Petal.Length, Petal.Width) # keep only the named columns
head(iris_select)
##   Species Petal.Length Petal.Width
## 1  setosa          1.4         0.2
## 2  setosa          1.4         0.2
## 3  setosa          1.3         0.2
## 4  setosa          1.5         0.2
## 5  setosa          1.4         0.2
## 6  setosa          1.7         0.4
# ok but what if we do not want to list all the names e.g., imagine you have more than 20 columns - you can use some helper functions

iris_select <- iris %>% select(Species, matches("Width")) # any column whose name matches "Width"
head(iris_select)
##   Species Sepal.Width Petal.Width
## 1  setosa         3.5         0.2
## 2  setosa         3.0         0.2
## 3  setosa         3.2         0.2
## 4  setosa         3.1         0.2
## 5  setosa         3.6         0.2
## 6  setosa         3.9         0.4
iris_select <- iris %>% select(Species, contains("Sepal")) # any column whose name contains "Sepal"
head(iris_select)
##   Species Sepal.Length Sepal.Width
## 1  setosa          5.1         3.5
## 2  setosa          4.9         3.0
## 3  setosa          4.7         3.2
## 4  setosa          4.6         3.1
## 5  setosa          5.0         3.6
## 6  setosa          5.4         3.9
iris_select <- iris %>% select(Species, starts_with("Petal")) # any column whose name starts with "Petal"
head(iris_select)
##   Species Petal.Length Petal.Width
## 1  setosa          1.4         0.2
## 2  setosa          1.4         0.2
## 3  setosa          1.3         0.2
## 4  setosa          1.5         0.2
## 5  setosa          1.4         0.2
## 6  setosa          1.7         0.4
iris_select <- iris %>% select(Species, ends_with("Length")) # any column whose name ends with "Length"
head(iris_select)
##   Species Sepal.Length Petal.Length
## 1  setosa          5.1          1.4
## 2  setosa          4.9          1.4
## 3  setosa          4.7          1.3
## 4  setosa          4.6          1.5
## 5  setosa          5.0          1.4
## 6  setosa          5.4          1.7
iris %>% select(matches("[A-Z]\\.")) # any column whose name has a capital letter followed by a dot, i.e. the four measurement columns
##     Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1            5.1         3.5          1.4         0.2
## 2            4.9         3.0          1.4         0.2
## 3            4.7         3.2          1.3         0.2
## 4            4.6         3.1          1.5         0.2
## 5            5.0         3.6          1.4         0.2
## 6            5.4         3.9          1.7         0.4
## 7            4.6         3.4          1.4         0.3
## 8            5.0         3.4          1.5         0.2
## 9            4.4         2.9          1.4         0.2
## 10           4.9         3.1          1.5         0.1
## 11           5.4         3.7          1.5         0.2
## 12           4.8         3.4          1.6         0.2
## 13           4.8         3.0          1.4         0.1
## 14           4.3         3.0          1.1         0.1
## 15           5.8         4.0          1.2         0.2
## 16           5.7         4.4          1.5         0.4
## 17           5.4         3.9          1.3         0.4
## 18           5.1         3.5          1.4         0.3
## 19           5.7         3.8          1.7         0.3
## 20           5.1         3.8          1.5         0.3
## 21           5.4         3.4          1.7         0.2
## 22           5.1         3.7          1.5         0.4
## 23           4.6         3.6          1.0         0.2
## 24           5.1         3.3          1.7         0.5
## 25           4.8         3.4          1.9         0.2
## 26           5.0         3.0          1.6         0.2
## 27           5.0         3.4          1.6         0.4
## 28           5.2         3.5          1.5         0.2
## 29           5.2         3.4          1.4         0.2
## 30           4.7         3.2          1.6         0.2
## 31           4.8         3.1          1.6         0.2
## 32           5.4         3.4          1.5         0.4
## 33           5.2         4.1          1.5         0.1
## 34           5.5         4.2          1.4         0.2
## 35           4.9         3.1          1.5         0.2
## 36           5.0         3.2          1.2         0.2
## 37           5.5         3.5          1.3         0.2
## 38           4.9         3.6          1.4         0.1
## 39           4.4         3.0          1.3         0.2
## 40           5.1         3.4          1.5         0.2
## 41           5.0         3.5          1.3         0.3
## 42           4.5         2.3          1.3         0.3
## 43           4.4         3.2          1.3         0.2
## 44           5.0         3.5          1.6         0.6
## 45           5.1         3.8          1.9         0.4
## 46           4.8         3.0          1.4         0.3
## 47           5.1         3.8          1.6         0.2
## 48           4.6         3.2          1.4         0.2
## 49           5.3         3.7          1.5         0.2
## 50           5.0         3.3          1.4         0.2
## 51           7.0         3.2          4.7         1.4
## 52           6.4         3.2          4.5         1.5
## 53           6.9         3.1          4.9         1.5
## 54           5.5         2.3          4.0         1.3
## 55           6.5         2.8          4.6         1.5
## 56           5.7         2.8          4.5         1.3
## 57           6.3         3.3          4.7         1.6
## 58           4.9         2.4          3.3         1.0
## 59           6.6         2.9          4.6         1.3
## 60           5.2         2.7          3.9         1.4
## 61           5.0         2.0          3.5         1.0
## 62           5.9         3.0          4.2         1.5
## 63           6.0         2.2          4.0         1.0
## 64           6.1         2.9          4.7         1.4
## 65           5.6         2.9          3.6         1.3
## 66           6.7         3.1          4.4         1.4
## 67           5.6         3.0          4.5         1.5
## 68           5.8         2.7          4.1         1.0
## 69           6.2         2.2          4.5         1.5
## 70           5.6         2.5          3.9         1.1
## 71           5.9         3.2          4.8         1.8
## 72           6.1         2.8          4.0         1.3
## 73           6.3         2.5          4.9         1.5
## 74           6.1         2.8          4.7         1.2
## 75           6.4         2.9          4.3         1.3
## 76           6.6         3.0          4.4         1.4
## 77           6.8         2.8          4.8         1.4
## 78           6.7         3.0          5.0         1.7
## 79           6.0         2.9          4.5         1.5
## 80           5.7         2.6          3.5         1.0
## 81           5.5         2.4          3.8         1.1
## 82           5.5         2.4          3.7         1.0
## 83           5.8         2.7          3.9         1.2
## 84           6.0         2.7          5.1         1.6
## 85           5.4         3.0          4.5         1.5
## 86           6.0         3.4          4.5         1.6
## 87           6.7         3.1          4.7         1.5
## 88           6.3         2.3          4.4         1.3
## 89           5.6         3.0          4.1         1.3
## 90           5.5         2.5          4.0         1.3
## 91           5.5         2.6          4.4         1.2
## 92           6.1         3.0          4.6         1.4
## 93           5.8         2.6          4.0         1.2
## 94           5.0         2.3          3.3         1.0
## 95           5.6         2.7          4.2         1.3
## 96           5.7         3.0          4.2         1.2
## 97           5.7         2.9          4.2         1.3
## 98           6.2         2.9          4.3         1.3
## 99           5.1         2.5          3.0         1.1
## 100          5.7         2.8          4.1         1.3
## 101          6.3         3.3          6.0         2.5
## 102          5.8         2.7          5.1         1.9
## 103          7.1         3.0          5.9         2.1
## 104          6.3         2.9          5.6         1.8
## 105          6.5         3.0          5.8         2.2
## 106          7.6         3.0          6.6         2.1
## 107          4.9         2.5          4.5         1.7
## 108          7.3         2.9          6.3         1.8
## 109          6.7         2.5          5.8         1.8
## 110          7.2         3.6          6.1         2.5
## 111          6.5         3.2          5.1         2.0
## 112          6.4         2.7          5.3         1.9
## 113          6.8         3.0          5.5         2.1
## 114          5.7         2.5          5.0         2.0
## 115          5.8         2.8          5.1         2.4
## 116          6.4         3.2          5.3         2.3
## 117          6.5         3.0          5.5         1.8
## 118          7.7         3.8          6.7         2.2
## 119          7.7         2.6          6.9         2.3
## 120          6.0         2.2          5.0         1.5
## 121          6.9         3.2          5.7         2.3
## 122          5.6         2.8          4.9         2.0
## 123          7.7         2.8          6.7         2.0
## 124          6.3         2.7          4.9         1.8
## 125          6.7         3.3          5.7         2.1
## 126          7.2         3.2          6.0         1.8
## 127          6.2         2.8          4.8         1.8
## 128          6.1         3.0          4.9         1.8
## 129          6.4         2.8          5.6         2.1
## 130          7.2         3.0          5.8         1.6
## 131          7.4         2.8          6.1         1.9
## 132          7.9         3.8          6.4         2.0
## 133          6.4         2.8          5.6         2.2
## 134          6.3         2.8          5.1         1.5
## 135          6.1         2.6          5.6         1.4
## 136          7.7         3.0          6.1         2.3
## 137          6.3         3.4          5.6         2.4
## 138          6.4         3.1          5.5         1.8
## 139          6.0         3.0          4.8         1.8
## 140          6.9         3.1          5.4         2.1
## 141          6.7         3.1          5.6         2.4
## 142          6.9         3.1          5.1         2.3
## 143          5.8         2.7          5.1         1.9
## 144          6.8         3.2          5.9         2.3
## 145          6.7         3.3          5.7         2.5
## 146          6.7         3.0          5.2         2.3
## 147          6.3         2.5          5.0         1.9
## 148          6.5         3.0          5.2         2.0
## 149          6.2         3.4          5.4         2.3
## 150          5.9         3.0          5.1         1.8

OK, so we can select columns - but can we do something with them? Yes, with mutate, which allows one to create new columns or operate on columns based on the values of other columns.

# mutate creates new columns computed from existing ones; select first just keeps the example small
iris_select <- iris %>% 
  select(Species, Petal.Length, Petal.Width) %>% 
  mutate(Times.Bigger = (Petal.Length/Petal.Width)) # Create new column, set = to condition/function

# several new columns can be created inside a single mutate() call
iris_select <- iris %>% 
  select(Species, Petal.Length, Petal.Width) %>% 
  mutate(Times.Bigger = (Petal.Length/Petal.Width),
         Log.Length = log(Petal.Length+1)) # log(x + 1) transform
head(iris_select)
##   Species Petal.Length Petal.Width Times.Bigger Log.Length
## 1  setosa          1.4         0.2         7.00  0.8754687
## 2  setosa          1.4         0.2         7.00  0.8754687
## 3  setosa          1.3         0.2         6.50  0.8329091
## 4  setosa          1.5         0.2         7.50  0.9162907
## 5  setosa          1.4         0.2         7.00  0.8754687
## 6  setosa          1.7         0.4         4.25  0.9932518
# BASE R equivalent: assign a new column with $ (no dplyr needed)
iris_copy <- iris
# use <- for assignment per R style convention (= is reserved for function arguments)
iris_copy$Time.Bigger <- iris_copy$Petal.Length / iris_copy$Petal.Width
head(iris_copy)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species Time.Bigger
## 1          5.1         3.5          1.4         0.2  setosa        7.00
## 2          4.9         3.0          1.4         0.2  setosa        7.00
## 3          4.7         3.2          1.3         0.2  setosa        6.50
## 4          4.6         3.1          1.5         0.2  setosa        7.50
## 5          5.0         3.6          1.4         0.2  setosa        7.00
## 6          5.4         3.9          1.7         0.4  setosa        4.25

How can we work on specific rows? With filter, we can condition on column values, e.g., grab only the setosa species or only flowers above a certain length.

setosa_small <- iris %>%         # create from iris dataset
  filter(Species == "setosa" & Sepal.Length < 5) # filter for species name and sepal length under 5

filter(iris, Petal.Length > 6 & Sepal.Length > 7) # filter() can also be called directly, without a pipe
##   Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 1          7.6         3.0          6.6         2.1 virginica
## 2          7.3         2.9          6.3         1.8 virginica
## 3          7.2         3.6          6.1         2.5 virginica
## 4          7.7         3.8          6.7         2.2 virginica
## 5          7.7         2.6          6.9         2.3 virginica
## 6          7.7         2.8          6.7         2.0 virginica
## 7          7.4         2.8          6.1         1.9 virginica
## 8          7.9         3.8          6.4         2.0 virginica
## 9          7.7         3.0          6.1         2.3 virginica
head(setosa_small) # view the first rows of the new dataset
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          4.9         3.0          1.4         0.2  setosa
## 2          4.7         3.2          1.3         0.2  setosa
## 3          4.6         3.1          1.5         0.2  setosa
## 4          4.6         3.4          1.4         0.3  setosa
## 5          4.4         2.9          1.4         0.2  setosa
## 6          4.9         3.1          1.5         0.1  setosa
# we can actually combine the filter with mutate which is often useful

iris_flagged <- iris %>% mutate(TallPlants = Sepal.Length > 5.8 | Petal.Length > 3.5) # logical flag column
head(iris_flagged %>% filter(Sepal.Length > 5.8 | Petal.Length > 3.5)) # show only the flagged rows
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species TallPlants
## 1          7.0         3.2          4.7         1.4 versicolor       TRUE
## 2          6.4         3.2          4.5         1.5 versicolor       TRUE
## 3          6.9         3.1          4.9         1.5 versicolor       TRUE
## 4          5.5         2.3          4.0         1.3 versicolor       TRUE
## 5          6.5         2.8          4.6         1.5 versicolor       TRUE
## 6          5.7         2.8          4.5         1.3 versicolor       TRUE
# One can use case_when to produce values other than TRUE/FALSE, which is useful when creating new variables
# case_when is like a vectorised if_else statement


iris_flagged <- iris %>% mutate(TallPlants = 
                                  case_when(Sepal.Length > 5.8 | Petal.Length > 3.5 ~ "Tall",
                                            TRUE ~ "Short")) # TRUE ~ acts as the catch-all "else" branch

## case_when can be used with multiple conditions and can be thought of as 
# if (condition) {
# do x
#} else { do y}

# conditions are evaluated in order: the first one that matches wins
iris_flagged <- iris %>% mutate(TallPlants = 
                                  case_when(Sepal.Length > 5.8 | Petal.Length > 3.5 ~ "Tall",
                                            Sepal.Length < 5.8 & Sepal.Length > 5 ~ "Medium",
                                            TRUE ~ "Short"))

summarise is similar to psych::describe() we saw above; in base R you can use aggregate. The basic idea is that you create a new dataframe/tibble that describes the data. Although with the current data this may not appear very useful, it can be very helpful when you have multi-level data, e.g., multiple trials from multiple subjects - we will see this later on. In its simplest form it just provides the mean of a column.

iris %>% summarise(mean(Sepal.Width)) # collapse the whole dataframe into a single summary row
##   mean(Sepal.Width)
## 1          3.057333
iris %>% summarise(mean(Sepal.Width), n = n()) # n() gives the number of observations
##   mean(Sepal.Width)   n
## 1          3.057333 150
iris %>% summarise(mean(Sepal.Width), max(Sepal.Width), median(Sepal.Width),n = n()) # several summary statistics at once
##   mean(Sepal.Width) max(Sepal.Width) median(Sepal.Width)   n
## 1          3.057333              4.4                   3 150
# base R similar way: formula interface, where ~ 1 means "no grouping variable"
aggregate(Sepal.Width~1, data = iris,mean)
##   Sepal.Width
## 1    3.057333
# a custom FUN can return several statistics at once
agg_Df<- aggregate(Sepal.Width~1, data = iris,FUN = function(x) c(mean = mean(x), sd = sd(x)))
head(agg_Df)
##   Sepal.Width.mean Sepal.Width.sd
## 1        3.0573333      0.4358663
# some people use apply which applies a function to each element of data.frame - apply(iris[,1:4],2,mean) # 2 specifies that it should be applied over columns

# a benefit of summarise is that we can use group_by to calculate descriptives per group
# The group_by() function allows us to break up our dataset by a specific value. Above we computed the mean, max, and median of sepal width for the whole dataset; group_by(Species) gives the same summaries per species.

iris %>%  group_by(Species) %>% summarise(mean(Sepal.Width), max(Sepal.Width), median(Sepal.Width),n = n())
## # A tibble: 3 × 5
##   Species    `mean(Sepal.Width)` `max(Sepal.Width)` `median(Sepal.Width)`     n
##   <fct>                    <dbl>              <dbl>                 <dbl> <int>
## 1 setosa                    3.43                4.4                   3.4    50
## 2 versicolor                2.77                3.4                   2.8    50
## 3 virginica                 2.97                3.8                   3      50
# we can also arrange data
# 
# arrange() allows us to rearrange our data based on certain conditions. Let's create a new dataset that contains only the virginica species, has sepal widths greater than 2.9, and is arranged in descending sepal lengths (we will use the desc() function).
# 
# NOTE: desc() stands for descending. It places the rows in descending order.

iris_new <- iris %>% # call from iris data
  filter(Species == "virginica" & Sepal.Width > 2.9) %>% # filter species and sepal width
  arrange(desc(Sepal.Length)) # arrange the data by descending values in sepal length

iris_new # view the new data frame
##    Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 1           7.9         3.8          6.4         2.0 virginica
## 2           7.7         3.8          6.7         2.2 virginica
## 3           7.7         3.0          6.1         2.3 virginica
## 4           7.6         3.0          6.6         2.1 virginica
## 5           7.2         3.6          6.1         2.5 virginica
## 6           7.2         3.2          6.0         1.8 virginica
## 7           7.2         3.0          5.8         1.6 virginica
## 8           7.1         3.0          5.9         2.1 virginica
## 9           6.9         3.2          5.7         2.3 virginica
## 10          6.9         3.1          5.4         2.1 virginica
## 11          6.9         3.1          5.1         2.3 virginica
## 12          6.8         3.0          5.5         2.1 virginica
## 13          6.8         3.2          5.9         2.3 virginica
## 14          6.7         3.3          5.7         2.1 virginica
## 15          6.7         3.1          5.6         2.4 virginica
## 16          6.7         3.3          5.7         2.5 virginica
## 17          6.7         3.0          5.2         2.3 virginica
## 18          6.5         3.0          5.8         2.2 virginica
## 19          6.5         3.2          5.1         2.0 virginica
## 20          6.5         3.0          5.5         1.8 virginica
## 21          6.5         3.0          5.2         2.0 virginica
## 22          6.4         3.2          5.3         2.3 virginica
## 23          6.4         3.1          5.5         1.8 virginica
## 24          6.3         3.3          6.0         2.5 virginica
## 25          6.3         3.4          5.6         2.4 virginica
## 26          6.2         3.4          5.4         2.3 virginica
## 27          6.1         3.0          4.9         1.8 virginica
## 28          6.0         3.0          4.8         1.8 virginica
## 29          5.9         3.0          5.1         1.8 virginica
# psych::describeBy() prints a full descriptive table (n, mean, sd, skew, ...)
# separately for each level of the grouping variable
psych::describeBy(iris, group="Species")
## 
##  Descriptive statistics by group 
## Species: setosa
##              vars  n mean   sd median trimmed  mad min max range skew kurtosis   se
## Sepal.Length    1 50 5.01 0.35    5.0    5.00 0.30 4.3 5.8   1.5 0.11    -0.45 0.05
## Sepal.Width     2 50 3.43 0.38    3.4    3.42 0.37 2.3 4.4   2.1 0.04     0.60 0.05
## Petal.Length    3 50 1.46 0.17    1.5    1.46 0.15 1.0 1.9   0.9 0.10     0.65 0.02
## Petal.Width     4 50 0.25 0.11    0.2    0.24 0.00 0.1 0.6   0.5 1.18     1.26 0.01
## Species*        5 50 1.00 0.00    1.0    1.00 0.00 1.0 1.0   0.0  NaN      NaN 0.00
## ---------------------------------------------------------------------------------------------- 
## Species: versicolor
##              vars  n mean   sd median trimmed  mad min max range  skew kurtosis   se
## Sepal.Length    1 50 5.94 0.52   5.90    5.94 0.52 4.9 7.0   2.1  0.10    -0.69 0.07
## Sepal.Width     2 50 2.77 0.31   2.80    2.78 0.30 2.0 3.4   1.4 -0.34    -0.55 0.04
## Petal.Length    3 50 4.26 0.47   4.35    4.29 0.52 3.0 5.1   2.1 -0.57    -0.19 0.07
## Petal.Width     4 50 1.33 0.20   1.30    1.32 0.22 1.0 1.8   0.8 -0.03    -0.59 0.03
## Species*        5 50 2.00 0.00   2.00    2.00 0.00 2.0 2.0   0.0   NaN      NaN 0.00
## ---------------------------------------------------------------------------------------------- 
## Species: virginica
##              vars  n mean   sd median trimmed  mad min max range  skew kurtosis   se
## Sepal.Length    1 50 6.59 0.64   6.50    6.57 0.59 4.9 7.9   3.0  0.11    -0.20 0.09
## Sepal.Width     2 50 2.97 0.32   3.00    2.96 0.30 2.2 3.8   1.6  0.34     0.38 0.05
## Petal.Length    3 50 5.55 0.55   5.55    5.51 0.67 4.5 6.9   2.4  0.52    -0.37 0.08
## Petal.Width     4 50 2.03 0.27   2.00    2.03 0.30 1.4 2.5   1.1 -0.12    -0.75 0.04
## Species*        5 50 3.00 0.00   3.00    3.00 0.00 3.0 3.0   0.0   NaN      NaN 0.00

We often want to rename columns — use `rename` / `rename_with`; if instead we want to change the names of the values within a column, use `recode`.

print(names(iris_flagged))
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"      "TallPlants"
# rename(new_name = old_name); plain `=` is the conventional form here -
# `:=` is only needed when the new name is built programmatically via tidy evaluation
iris_flagged <- iris_flagged %>% rename(Height = TallPlants)
print(names(iris_flagged))
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"      "Height"
# rename every column at once: replace "." with "_" and lower-case the names;
# ~ ... .x is shorthand for a one-argument anonymous function applied to each name
iris %>%
    rename_with(~ tolower(gsub("\\.", "_", .x)), .cols = everything())
##     sepal_length sepal_width petal_length petal_width    species
## 1            5.1         3.5          1.4         0.2     setosa
## 2            4.9         3.0          1.4         0.2     setosa
## 3            4.7         3.2          1.3         0.2     setosa
## 4            4.6         3.1          1.5         0.2     setosa
## 5            5.0         3.6          1.4         0.2     setosa
## 6            5.4         3.9          1.7         0.4     setosa
## 7            4.6         3.4          1.4         0.3     setosa
## 8            5.0         3.4          1.5         0.2     setosa
## 9            4.4         2.9          1.4         0.2     setosa
## 10           4.9         3.1          1.5         0.1     setosa
## 11           5.4         3.7          1.5         0.2     setosa
## 12           4.8         3.4          1.6         0.2     setosa
## 13           4.8         3.0          1.4         0.1     setosa
## 14           4.3         3.0          1.1         0.1     setosa
## 15           5.8         4.0          1.2         0.2     setosa
## 16           5.7         4.4          1.5         0.4     setosa
## 17           5.4         3.9          1.3         0.4     setosa
## 18           5.1         3.5          1.4         0.3     setosa
## 19           5.7         3.8          1.7         0.3     setosa
## 20           5.1         3.8          1.5         0.3     setosa
## 21           5.4         3.4          1.7         0.2     setosa
## 22           5.1         3.7          1.5         0.4     setosa
## 23           4.6         3.6          1.0         0.2     setosa
## 24           5.1         3.3          1.7         0.5     setosa
## 25           4.8         3.4          1.9         0.2     setosa
## 26           5.0         3.0          1.6         0.2     setosa
## 27           5.0         3.4          1.6         0.4     setosa
## 28           5.2         3.5          1.5         0.2     setosa
## 29           5.2         3.4          1.4         0.2     setosa
## 30           4.7         3.2          1.6         0.2     setosa
## 31           4.8         3.1          1.6         0.2     setosa
## 32           5.4         3.4          1.5         0.4     setosa
## 33           5.2         4.1          1.5         0.1     setosa
## 34           5.5         4.2          1.4         0.2     setosa
## 35           4.9         3.1          1.5         0.2     setosa
## 36           5.0         3.2          1.2         0.2     setosa
## 37           5.5         3.5          1.3         0.2     setosa
## 38           4.9         3.6          1.4         0.1     setosa
## 39           4.4         3.0          1.3         0.2     setosa
## 40           5.1         3.4          1.5         0.2     setosa
## 41           5.0         3.5          1.3         0.3     setosa
## 42           4.5         2.3          1.3         0.3     setosa
## 43           4.4         3.2          1.3         0.2     setosa
## 44           5.0         3.5          1.6         0.6     setosa
## 45           5.1         3.8          1.9         0.4     setosa
## 46           4.8         3.0          1.4         0.3     setosa
## 47           5.1         3.8          1.6         0.2     setosa
## 48           4.6         3.2          1.4         0.2     setosa
## 49           5.3         3.7          1.5         0.2     setosa
## 50           5.0         3.3          1.4         0.2     setosa
## 51           7.0         3.2          4.7         1.4 versicolor
## 52           6.4         3.2          4.5         1.5 versicolor
## 53           6.9         3.1          4.9         1.5 versicolor
## 54           5.5         2.3          4.0         1.3 versicolor
## 55           6.5         2.8          4.6         1.5 versicolor
## 56           5.7         2.8          4.5         1.3 versicolor
## 57           6.3         3.3          4.7         1.6 versicolor
## 58           4.9         2.4          3.3         1.0 versicolor
## 59           6.6         2.9          4.6         1.3 versicolor
## 60           5.2         2.7          3.9         1.4 versicolor
## 61           5.0         2.0          3.5         1.0 versicolor
## 62           5.9         3.0          4.2         1.5 versicolor
## 63           6.0         2.2          4.0         1.0 versicolor
## 64           6.1         2.9          4.7         1.4 versicolor
## 65           5.6         2.9          3.6         1.3 versicolor
## 66           6.7         3.1          4.4         1.4 versicolor
## 67           5.6         3.0          4.5         1.5 versicolor
## 68           5.8         2.7          4.1         1.0 versicolor
## 69           6.2         2.2          4.5         1.5 versicolor
## 70           5.6         2.5          3.9         1.1 versicolor
## 71           5.9         3.2          4.8         1.8 versicolor
## 72           6.1         2.8          4.0         1.3 versicolor
## 73           6.3         2.5          4.9         1.5 versicolor
## 74           6.1         2.8          4.7         1.2 versicolor
## 75           6.4         2.9          4.3         1.3 versicolor
## 76           6.6         3.0          4.4         1.4 versicolor
## 77           6.8         2.8          4.8         1.4 versicolor
## 78           6.7         3.0          5.0         1.7 versicolor
## 79           6.0         2.9          4.5         1.5 versicolor
## 80           5.7         2.6          3.5         1.0 versicolor
## 81           5.5         2.4          3.8         1.1 versicolor
## 82           5.5         2.4          3.7         1.0 versicolor
## 83           5.8         2.7          3.9         1.2 versicolor
## 84           6.0         2.7          5.1         1.6 versicolor
## 85           5.4         3.0          4.5         1.5 versicolor
## 86           6.0         3.4          4.5         1.6 versicolor
## 87           6.7         3.1          4.7         1.5 versicolor
## 88           6.3         2.3          4.4         1.3 versicolor
## 89           5.6         3.0          4.1         1.3 versicolor
## 90           5.5         2.5          4.0         1.3 versicolor
## 91           5.5         2.6          4.4         1.2 versicolor
## 92           6.1         3.0          4.6         1.4 versicolor
## 93           5.8         2.6          4.0         1.2 versicolor
## 94           5.0         2.3          3.3         1.0 versicolor
## 95           5.6         2.7          4.2         1.3 versicolor
## 96           5.7         3.0          4.2         1.2 versicolor
## 97           5.7         2.9          4.2         1.3 versicolor
## 98           6.2         2.9          4.3         1.3 versicolor
## 99           5.1         2.5          3.0         1.1 versicolor
## 100          5.7         2.8          4.1         1.3 versicolor
## 101          6.3         3.3          6.0         2.5  virginica
## 102          5.8         2.7          5.1         1.9  virginica
## 103          7.1         3.0          5.9         2.1  virginica
## 104          6.3         2.9          5.6         1.8  virginica
## 105          6.5         3.0          5.8         2.2  virginica
## 106          7.6         3.0          6.6         2.1  virginica
## 107          4.9         2.5          4.5         1.7  virginica
## 108          7.3         2.9          6.3         1.8  virginica
## 109          6.7         2.5          5.8         1.8  virginica
## 110          7.2         3.6          6.1         2.5  virginica
## 111          6.5         3.2          5.1         2.0  virginica
## 112          6.4         2.7          5.3         1.9  virginica
## 113          6.8         3.0          5.5         2.1  virginica
## 114          5.7         2.5          5.0         2.0  virginica
## 115          5.8         2.8          5.1         2.4  virginica
## 116          6.4         3.2          5.3         2.3  virginica
## 117          6.5         3.0          5.5         1.8  virginica
## 118          7.7         3.8          6.7         2.2  virginica
## 119          7.7         2.6          6.9         2.3  virginica
## 120          6.0         2.2          5.0         1.5  virginica
## 121          6.9         3.2          5.7         2.3  virginica
## 122          5.6         2.8          4.9         2.0  virginica
## 123          7.7         2.8          6.7         2.0  virginica
## 124          6.3         2.7          4.9         1.8  virginica
## 125          6.7         3.3          5.7         2.1  virginica
## 126          7.2         3.2          6.0         1.8  virginica
## 127          6.2         2.8          4.8         1.8  virginica
## 128          6.1         3.0          4.9         1.8  virginica
## 129          6.4         2.8          5.6         2.1  virginica
## 130          7.2         3.0          5.8         1.6  virginica
## 131          7.4         2.8          6.1         1.9  virginica
## 132          7.9         3.8          6.4         2.0  virginica
## 133          6.4         2.8          5.6         2.2  virginica
## 134          6.3         2.8          5.1         1.5  virginica
## 135          6.1         2.6          5.6         1.4  virginica
## 136          7.7         3.0          6.1         2.3  virginica
## 137          6.3         3.4          5.6         2.4  virginica
## 138          6.4         3.1          5.5         1.8  virginica
## 139          6.0         3.0          4.8         1.8  virginica
## 140          6.9         3.1          5.4         2.1  virginica
## 141          6.7         3.1          5.6         2.4  virginica
## 142          6.9         3.1          5.1         2.3  virginica
## 143          5.8         2.7          5.1         1.9  virginica
## 144          6.8         3.2          5.9         2.3  virginica
## 145          6.7         3.3          5.7         2.5  virginica
## 146          6.7         3.0          5.2         2.3  virginica
## 147          6.3         2.5          5.0         1.9  virginica
## 148          6.5         3.0          5.2         2.0  virginica
## 149          6.2         3.4          5.4         2.3  virginica
## 150          5.9         3.0          5.1         1.8  virginica
# the ~ form used above is shorthand for this full anonymous function:
#  function(.x) {
#   tolower(gsub("\\.", "_", .x))
# }

# with ~ we are creating a function but not assigning it to a variable - another short hand
# https://coolbutuseless.github.io/2019/03/13/anonymous-functions-in-r-part-1/


# NOTE although the original data had name Species, now because of tolower we use species
# recode(column, old = new, ...) relabels the values inside a column
iris_clean = iris %>%
    rename_with(~ tolower(gsub("\\.", "_", .x)), .cols = everything()) %>% 
    mutate(species_new = recode(species, 
                                "setosa" = "A",
                                "versicolor" = "B",
                                "virginica"= "C"))

head(iris_clean)
##   sepal_length sepal_width petal_length petal_width species species_new
## 1          5.1         3.5          1.4         0.2  setosa           A
## 2          4.9         3.0          1.4         0.2  setosa           A
## 3          4.7         3.2          1.3         0.2  setosa           A
## 4          4.6         3.1          1.5         0.2  setosa           A
## 5          5.0         3.6          1.4         0.2  setosa           A
## 6          5.4         3.9          1.7         0.4  setosa           A

simple visualisations

# Base-R plotting: not pretty, but quick. On a data frame, plot() picks a sensible
# default - here a pairs plot (a scatter plot for every pair of variables).
plot(iris)

hist(iris$Sepal.Length) # quick histogram of a single column

# formula interface: y-axis variable ~ x-axis grouping; useful for checking for outliers
boxplot(iris$Sepal.Length ~ iris$Species)

# lattice::dotplot(iris$Sepal.Length ~ iris$Species)
plot(iris$Sepal.Length ~ iris$Sepal.Width) # scatter plot via the formula interface

Visualization - the plotting can be done with basic plots but most people use the ggplot library which can create a bit prettier plots

“ggplot2 is designed to work iteratively. You start with a layer that shows the raw data. Then you add layers of annotations and statistical summaries. This allows you to produce graphics using the same structured thinking that you would use to design an analysis. This reduces the distance between the plot in your head and the one on the page. This is especially helpful for students who have not yet developed the structured approach to analysis used by experts.”

ggplot2 book - https://ggplot2-book.org/getting-started.html

A plot in ggplot2 is described in three parts:

1) Aesthetics: the relationship between data and visual properties that defines the working space of the plot (which variables map onto individual axes, color, size, fill, etc.). 2) Geometrical primitives that visualize your data (points, lines, error bars, etc.), which are added to the plot. 3) Other properties of the plot (scaling of axes, labels, annotations, etc.), which are also added to the plot.

Every ggplot2 plot has three key components:

data,

A set of aesthetic mappings between variables in the data and visual properties, and

At least one layer which describes how to render each observation. Layers are usually created with a geom function.

ggplot2 Main Logic ggplot2 follows a layered grammar of graphics:

Start with data + aesthetics: ggplot(data, aes(x = …, y = …))

Add geoms (visual elements): + geom_point(), geom_line(), geom_bar() Map additional aesthetics (color, shape, size):

Think of it as building a plot step-by-step, like layers on a cake.

library(ggplot2)
library(patchwork)
# aes() maps data columns onto visual properties (axes, colour);
# geom_point() then draws one point per row
ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length, col = Species)) + geom_point()

# notice how the data argument is not needed when piping - the data frame
# comes in through %>% as the first argument:
# iris %>% ggplot(aes(x = Sepal.Width, y = Sepal.Length, col = Species)) + geom_point()


ggplot(iris, aes(x = Petal.Length, y = Petal.Width, col=Species)) +
  geom_point() +
  # a linear regression over all dots in the group
  geom_smooth(method="lm", formula = y ~ x, se=TRUE, linetype="dashed") 

# because col = Species sits in the top-level aes(), every layer (points and
# smooths) is split by species; aes(group = 1) below overrides that for one layer
ggplot(iris, aes(x = Petal.Length, y = Petal.Width, col=Species)) +
  geom_point() +
  # Group-specific regressions
  geom_smooth(method = "lm", formula = y ~ x, se=TRUE, linetype="dashed") +
  # Global regression overlay
  geom_smooth(
    aes(group = 1),     # tells ggplot to treat all data as one group
    method = "lm",
    formula = y ~ x,
    se = TRUE,
    color = "black",    # make it stand out (or whatever color you want)
    linewidth = 1.2,
    linetype = "solid"  # solid line for global fit
  )

## what if we want each Species to be a separate plot

# facet_grid(rows ~ columns): here "." means no row split, so we get one
# column of panels per species
ggplot(iris, aes(x = Petal.Length, y = Petal.Width, col=Species)) +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ x,
    se = TRUE,
  ) + facet_grid(. ~ Species)  + # makes a separate subplot for each group
  theme(legend.position = "none") # we don't need the legend as conditions are split between facets 

# Grab default ggplot colors and apply alpha
# default_colors <- scales::hue_pal()(3)  # get 3 default colors
# transparent_colors <- alpha(default_colors, 0.5)
# 
# ggplot(iris, aes(x = Petal.Length, y = Petal.Width, color = Species)) +
#   geom_point() +
#   geom_smooth(method = "lm", formula = y ~ x, se = FALSE, linetype = "dashed") +
#   scale_color_manual(values = transparent_colors) +
#   theme_minimal()

Often we need more than lines on a grid — so how do we change the labels, or specify particular colors for groups of variables?

# Re-create the three-level height flag (conditions checked in order,
# first match wins, TRUE is the final "else")
iris_flagged <- iris %>%
  mutate(
    TallPlants = case_when(
      Sepal.Length > 5.8 | Petal.Length > 3.5 ~ "Tall",
      Sepal.Length < 5.8 & Sepal.Length > 5 ~ "Medium",
      TRUE ~ "Short"
    )
  )

# Map both colour and shape onto the flag; labs() renames the title, the axes
# and both legends (colour and shape merge into one legend because their
# labels are identical).
ggplot(iris_flagged, aes(x = Petal.Length, y = Petal.Width)) +
  geom_point(aes(color = TallPlants, shape = TallPlants), size = 3) +
  labs(
    title = "Iris: Tall, Medium, and Short Plants",
    x = "Petal Length",
    y = "Petal Width",
    color = "Plant Height",
    shape = "Plant Height"
  ) +
  theme_minimal()

# Same plot, but with hand-picked colours per level via scale_color_manual()
ggplot(iris_flagged, aes(x = Petal.Length, y = Petal.Width)) +
  geom_point(aes(color = TallPlants, shape = TallPlants), size = 3) +
  scale_color_manual(
    values = c("Tall" = "red", "Medium" = "orange", "Short" = "gray")
  ) +
  labs(
    title = "Iris: Tall, Medium, and Short Plants",
    x = "Petal Length",
    y = "Petal Width",
    color = "Plant Height",
    shape = "Plant Height"
  ) +
  theme_minimal()

# a fixed colour outside aes(): every point is blue (not mapped to any column)
iris %>% ggplot(aes(x= Petal.Length, y = Petal.Width)) +geom_point(color="blue") 

iris %>% ggplot(aes(Petal.Length)) + geom_histogram() + theme_minimal() # theme_minimal() gives a cleaner look
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# store the plot in a variable so we can both display it and save it to disk
P<- iris %>% ggplot(aes(Petal.Length)) + geom_histogram(fill="skyblue", color = "black", bins = 30) + theme_minimal() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + # lets remove the annoying grid
  theme(text = element_text(size = 14)) + # increase the text size
  labs(title = "Distribution of Petal Length in Iris Dataset", 
       x = "Petal Length (cm)",
    y = "Count")

P

# write the figure to disk; width/height are in inches by default
ggsave('./Histogram_Petal_Length.png', P, width = 15, height = 8, dpi = 300)


## ok but often we might want more control over text elements

iris %>% ggplot(aes(Petal.Length)) + geom_histogram(fill="skyblue", color = "black", bins = 30) + theme_minimal() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + # lets remove the annoying grid
    theme(
    plot.title = element_text(size = 20, face = "bold", hjust = 0.5), # center title and make it bold and tell it explicitly what size it should be
    axis.title.x = element_text(size = 14), # xlabel
    axis.title.y = element_text(size = 18), # ylabel
    axis.text.y = element_text(size = 12), # yticklabels
    axis.text.x = element_text(size = 10), # xticklabels
    text = element_text(family = "serif")  # applies to all labels by default
  ) +
  labs(title = "Distribution of Petal Length in Iris Dataset", 
       x = "Petal Length (cm)",
    y = "Count")

## Use above theme object to manipulate your figures as you wish

## what if we want to examine histogram grouped by species
# fill = Species draws one histogram per species, stacked in a single plot
iris %>% ggplot(aes(Petal.Length, fill = Species)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

https://rstudio.github.io/cheatsheets/html/data-visualization.html

selection of GGPLOTs - http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html

GGPLOT book - https://ggplot2-book.org/

library(ggExtra)
# base scatter plot with one overall regression line
g<- ggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +
  geom_point() +
  # a linear regression over all dots in the group
  geom_smooth(method="lm", formula = y ~ x, se=TRUE, linetype="dashed") 

# ggMarginal() adds marginal distribution plots onto the axes of an existing ggplot
ggMarginal(g, type = "histogram", fill="transparent")

library(corrplot)
## corrplot 0.84 loaded
# correlation matrix of the numeric columns, drawn as a lower-triangle number
# plot. The pipe already supplies cor()'s result as corrplot's first argument,
# so the stray leading comma from the original call is removed.
iris %>%
  select(where(is.numeric)) %>%
  cor() %>%
  corrplot(type = 'lower', diag = FALSE, method = 'number',
           col = colorRampPalette(c("blue", "white", "red"))(200))

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# shorten the column names so the panel labels stay readable, then draw a
# scatter-plot matrix of all numeric columns with ggpairs()
iris %>% select(where(is.numeric)) %>%   rename(
    S_length = Sepal.Length,
    S_width = Sepal.Width,
    P_length = Petal.Length,
    P_width = Petal.Width
  )%>% ggpairs( lower = list(continuous = "points", combo = "dot_no_facet"),progress=FALSE,title='Iris') + theme_classic(base_size=15)

# Keep Species for mapping but exclude from actual pairs
iris_data <- iris %>%
  select(Species, where(is.numeric)) %>%
  rename(
    S_length = Sepal.Length,
    S_width = Sepal.Width,
    P_length = Petal.Length,
    P_width = Petal.Width
  ) 

ggpairs( iris_data,
  columns = 2:5,  # skip the first column (Species)
  mapping = aes(color = Species),
  lower = list(continuous = "points", combo = "dot_no_facet"),
  progress = FALSE,
  title = "Iris"
) +
  theme_classic(base_size = 15)

Ok, it is unlikely one would ever master all the data-wrangling needed, because each project requires a different set of options — Google and ChatGPT are your friends. Often you can do everything equally well in base R or the tidyverse; it is just a style preference. Just like learning a natural language, it does not make sense to learn the vocabulary if you do not have anything to say, so the best way to learn is to start using these skills to analyse data.

The simplest thing we can do is a t-test; let's compare the petal length of the setosa and virginica species.

# keep only the outcome column and the two species we want to compare
iris_t_Test_data <- iris %>% select(Species,Petal.Length) %>% filter(Species %in% c('setosa', 'virginica'))

# two-sample t-test via the formula interface;
# var.equal=TRUE gives the classic Student t-test (pooled variance)
t.test(Petal.Length ~ Species, data= iris_t_Test_data, var.equal=TRUE)
## 
##  Two Sample t-test
## 
## data:  Petal.Length by Species
## t = -49.986, df = 98, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group setosa and group virginica is not equal to 0
## 95 percent confidence interval:
##  -4.252374 -3.927626
## sample estimates:
##    mean in group setosa mean in group virginica 
##                   1.462                   5.552
# F test for equality of variances - a check on the equal-variance assumption
# of the t-test above. (var.test() has no var.equal argument; in the original
# call it was silently swallowed by ..., so it is dropped here.)
var.test(Petal.Length ~ Species, data= iris_t_Test_data)
## 
##  F test to compare two variances
## 
## data:  Petal.Length by Species
## F = 0.099016, num df = 49, denom df = 49, p-value = 1.875e-13
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.05618945 0.17448557
## sample estimates:
## ratio of variances 
##          0.0990164
# boxplot of the two groups - quick visual check of the difference
ggplot(data = iris_t_Test_data, aes(x= Species, y = Petal.Length)) + geom_boxplot() + theme_minimal()

# overlapping histograms show the full distributions
ggplot(data = iris_t_Test_data, aes(x= Petal.Length, fill = Species)) + geom_histogram() + theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# the same two-group comparison as a linear model: the Species coefficient is
# the group difference, and its t value matches the t-test above (up to sign)
model_linear <- lm(Petal.Length ~ Species, data= iris_t_Test_data)
summary(model_linear)
## 
## Call:
## lm(formula = Petal.Length ~ Species, data = iris_t_Test_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0520 -0.1620  0.0380  0.1405  1.3480 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.46200    0.05786   25.27   <2e-16 ***
## Speciesvirginica  4.09000    0.08182   49.99   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4091 on 98 degrees of freedom
## Multiple R-squared:  0.9623, Adjusted R-squared:  0.9619 
## F-statistic:  2499 on 1 and 98 DF,  p-value: < 2.2e-16

What if we want to compare all species

anova in base R - https://bookdown.org/steve_midway/DAR/understanding-anova-in-r.html https://statsandr.com/blog/anova-in-r/ Remember it expects that the predictor IV is a Factor

# treatment (dummy) coding: the first factor level (setosa) is the reference -
# the intercept is its mean and each coefficient is a difference from it
contrasts(iris$Species) <- "contr.treatment"
lm_model <- lm(Petal.Length ~ Species, data = iris)
summary(lm_model)
## 
## Call:
## lm(formula = Petal.Length ~ Species, data = iris)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.260 -0.258  0.038  0.240  1.348 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.46200    0.06086   24.02   <2e-16 ***
## Speciesversicolor  2.79800    0.08607   32.51   <2e-16 ***
## Speciesvirginica   4.09000    0.08607   47.52   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4303 on 147 degrees of freedom
## Multiple R-squared:  0.9414, Adjusted R-squared:  0.9406 
## F-statistic:  1180 on 2 and 147 DF,  p-value: < 2.2e-16
contrasts(iris$Species) # show the coding matrix currently attached to the factor
##            versicolor virginica
## setosa              0         0
## versicolor          1         0
## virginica           0         1
# One way anova
one.way <- aov(Petal.Length ~ Species, data = iris) 
print(one.way$coefficients)
##       (Intercept) Speciesversicolor  Speciesvirginica 
##             1.462             2.798             4.090
# aov() fits the same model as lm() above - note the identical coefficients
summary(one.way)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## Species       2  437.1  218.55    1180 <2e-16 ***
## Residuals   147   27.2    0.19                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# alternatives if the equal-variance / normality assumptions look doubtful:
#one_way_Welch<- oneway.test(Petal.Length ~ Species, data = iris)
#OneWay_Kruskal<- kruskal.test(Petal.Length ~ Species, data = iris)

#plot(one.way)

# Tukey's HSD: all pairwise group comparisons with family-wise error control
print(TukeyHSD(one.way))
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Petal.Length ~ Species, data = iris)
## 
## $Species
##                       diff     lwr     upr p adj
## versicolor-setosa    2.798 2.59422 3.00178     0
## virginica-setosa     4.090 3.88622 4.29378     0
## virginica-versicolor 1.292 1.08822 1.49578     0
# library(multcomp)
# summary(glht(res.aov, linfct = mcp(group = "Tukey")))
# see https://www.sthda.com/english/wiki/one-way-anova-test-in-r
# pairwise.t.test(my_data$weight, my_data$group, p.adjust.method = "BH")
# anova(lm_model)

# options(contrasts = c("contr.sum", "contr.poly"))

# For sum-to-zero (deviation) coding
contrasts(iris$Species) <- "contr.sum"
# contrasts(iris$Species) <- "contr.helmert"

(contrasts(iris$Species) )
##            [,1] [,2]
## setosa        1    0
## versicolor    0    1
## virginica    -1   -1
# with sum coding the intercept is the grand mean and each coefficient is a
# group's deviation from it; the last level's deviation is minus their sum
lm_sum<- lm(Petal.Length ~ Species, data= iris)
summary(lm_sum)
## 
## Call:
## lm(formula = Petal.Length ~ Species, data = iris)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.260 -0.258  0.038  0.240  1.348 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.75800    0.03514  106.95   <2e-16 ***
## Species1    -2.29600    0.04969  -46.21   <2e-16 ***
## Species2     0.50200    0.04969   10.10   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4303 on 147 degrees of freedom
## Multiple R-squared:  0.9414, Adjusted R-squared:  0.9406 
## F-statistic:  1180 on 2 and 147 DF,  p-value: < 2.2e-16

Grand mean: 3.758

Setosa’s mean: 3.758 + (-2.296) = 1.462

Versicolor’s mean: 3.758 + 0.502 = 4.26

Virginica’s mean: 3.758 + 1.794 = 5.552 (virginica’s coefficient is not printed; under sum coding it is minus the sum of the others: −(−2.296 + 0.502) = 1.794)

these re-parametrisations of models are explained well here - https://www.youtube.com/playlist?list=PLB2iAtgpI4YHUJcyXnaoR3sWKSdhYDtjD

# bar height = group mean (computed by stat_summary from the raw data),
# error bar = mean +/- one standard error (mean_se)
ggplot(iris, aes(x = Species, y = Petal.Length, fill = Species)) +
  stat_summary(fun = mean, geom = "bar", color = "black", width = 0.6) +
  stat_summary(fun.data = mean_se, geom = "errorbar", width = 0.2) +
  theme_classic(base_size = 15)

see for options - https://www.sthda.com/english/articles/index.php?url=/24-ggpubr-publication-ready-plots/76-add-p-values-and-significance-levels-to-ggplots/

library(ggpubr)

# Perform pairwise comparisons
# NOTE(review): `effectsize` is not a documented compare_means() argument -
# presumably it is absorbed by ... and ignored; verify against the ggpubr docs
compare_means(Petal.Length ~ Species,  data = iris, paired = FALSE, effectsize = TRUE, method="t.test")
## # A tibble: 3 × 8
##   .y.          group1     group2            p   p.adj p.format p.signif method
##   <chr>        <chr>      <chr>         <dbl>   <dbl> <chr>    <chr>    <chr> 
## 1 Petal.Length setosa     versicolor 9.93e-46 2  e-45 <2e-16   ****     T-test
## 2 Petal.Length setosa     virginica  9.27e-50 2.8e-49 <2e-16   ****     T-test
## 3 Petal.Length versicolor virginica  4.90e-22 4.9e-22 <2e-16   ****     T-test
# the pairs of groups to annotate on the plot
my_comparisons <- list(c('setosa','versicolor'), c('setosa','virginica'), c('versicolor','virginica'))

# boxplot with pairwise t-test significance stars, plus an overall ANOVA p-value
ggboxplot(iris, x = "Species", y = "Petal.Length") +  
  stat_compare_means(comparisons = my_comparisons, method = "t.test", label = "p.signif") + 
  stat_compare_means(method = "anova",label.y = 9)

There are other packages for ANOVAs, like easyanova and ez (ezANOVA), but often one can simply fit an lm model like above, as long as consideration is given to the contrast coding. E.g., lm(Y ~ Factor1*Factor2) gives you the main effect of factor 1, the main effect of factor 2, and their interaction.

Regression model

# Load the Cam-CAN ageing dataset (cognition + neuroimaging summary measures)
# NOTE(review): rm(list = ls()) in scripts is discouraged — prefer restarting
# the R session; kept here as in the original tutorial
rm(list=ls()) # cleans environment
#library(ggplot2)
library(tidyverse)

# Data hosted on OSF; see the paper below for variable descriptions
url<- 'https://osf.io/download/kbd84/'
# https://doi.org/10.1016/j.neurobiolaging.2023.11.009

library(readr)

# read_csv infers column types; CCID is the subject identifier
data_df <- read_csv(url)
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
##   CCID = col_character(),
##   age = col_double(),
##   Sex = col_double(),
##   CattellPCA = col_double(),
##   Memory_700 = col_double(),
##   Mov_SyS = col_double(),
##   Rest_SyS = col_double(),
##   SMT_SyS = col_double(),
##   TIV = col_double(),
##   Education = col_double()
## )
# Inspect the first rows; note the NaN values that need cleaning below
head(data_df)
## # A tibble: 6 × 10
##   CCID       age   Sex CattellPCA Memory_700 Mov_SyS Rest_SyS SMT_SyS   TIV Education
##   <chr>    <dbl> <dbl>      <dbl>      <dbl>   <dbl>    <dbl>   <dbl> <dbl>     <dbl>
## 1 CC110033    24     1       21.1       23.7   0.691    0.704   0.684  1.37         3
## 2 CC110037    18     1       15.3       20.8   0.657    0.702   0.715  1.32         2
## 3 CC110045    24     2       20.6       23.6   0.729    0.725   0.735  1.35         3
## 4 CC110056    22     2       12.5       21.5   0.568    0.595   0.698  1.25         3
## 5 CC110062    20     1      NaN         22.1 NaN      NaN     NaN      1.55         2
## 6 CC110069    28     2       20.7       30.7   0.651    0.657   0.711  1.36         3

This is real data. Before you continue, you can try to clean it a bit: 1) Remove rows with NaN values for the main variables we will focus on - CattellPCA, Memory_700, Mov_SyS, Rest_SyS, SMT_SyS. 2) Create a new column that has sex recoded as -1 and 1, and make Sex a factor with 1 being Male and 2 being Female. 3) Plot how Mov_SyS differs across age. 4) Fit a linear model predicting Mov_SyS with age*Sex. 5) Fit a model predicting CattellPCA ~ SyS + age*Sex. 6) Plot partial regressions to understand the model.

Solution

# Drop rows with missing values in the key variables, then recode Sex:
# a labelled factor (1 = Male, 2 = Female) plus a -1/+1 numeric contrast (SexC)
df_Schaefer <- data_df %>%
  drop_na(CattellPCA, Memory_700, Mov_SyS, Rest_SyS, SMT_SyS) %>%
  mutate(
    Sex  = factor(Sex, labels = c("Male", "Female")),
    SexC = if_else(Sex == "Male", -1, 1, missing = NA_real_)
  )

head(df_Schaefer)
## # A tibble: 6 × 11
##   CCID       age Sex    CattellPCA Memory_700 Mov_SyS Rest_SyS SMT_SyS   TIV Education  SexC
##   <chr>    <dbl> <fct>       <dbl>      <dbl>   <dbl>    <dbl>   <dbl> <dbl>     <dbl> <dbl>
## 1 CC110033    24 Male         21.1       23.7   0.691    0.704   0.684  1.37         3    -1
## 2 CC110037    18 Male         15.3       20.8   0.657    0.702   0.715  1.32         2    -1
## 3 CC110045    24 Female       20.6       23.6   0.729    0.725   0.735  1.35         3     1
## 4 CC110056    22 Female       12.5       21.5   0.568    0.595   0.698  1.25         3     1
## 5 CC110069    28 Female       20.7       30.7   0.651    0.657   0.711  1.36         3     1
## 6 CC110087    28 Female       18.2       22.2   0.675    0.652   0.700  1.25         3     1

Needed later on: optionally, one can use a polynomial expansion of age to account for the curved relationship between age and SyS - sometimes base R is easier than tidyverse.

# Orthogonal quadratic polynomial of age. poly() returns a 2-column matrix,
# which is stored as a matrix-column; we then split it into separate
# linear (agepoly_1) and quadratic (agepoly_2) columns.
df_Schaefer$ageQuad <- poly(df_Schaefer$age, 2)
df_Schaefer$agepoly_1  <- df_Schaefer$ageQuad[,1]
df_Schaefer$agepoly_2 <- df_Schaefer$ageQuad[,2]

# z-standardise each polynomial term so the regression coefficients below
# are on a comparable scale
df_Schaefer$agepoly_1z <- datawizard::standardise(df_Schaefer$agepoly_1) 
df_Schaefer$agepoly_2z <- datawizard::standardise(df_Schaefer$agepoly_2) 
### MOVIE
# Scatter of movie-watching system segregation against age, with a quadratic
# (raw polynomial) regression fit overlaid
p <- ggplot(df_Schaefer, aes(x = age, y = Mov_SyS))
p <- p + geom_point(shape = 21, size = 3, colour = "black", fill = "white", stroke = 2)
p <- p + stat_smooth(method = "lm", se = TRUE, fill = "grey60",
                     formula = y ~ poly(x, 2, raw = TRUE),
                     colour = "springgreen3", linewidth = 3)
# formatting: x axis 15-90 with tick labels every 20 years; strip grid/border
p <- p + scale_x_continuous(breaks = round(seq(20, 80, by = 20), 1), limits = c(15, 90)) + #ylim(0,400) +
  theme_bw() +
  theme(panel.border = element_blank(), legend.position = "none",
        text = element_text(size = 18),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        # `linewidth` replaces the deprecated `size` argument of element_line()
        # (ggplot2 >= 3.4; this file already uses linewidth above)
        axis.line = element_line(colour = "black", linewidth = 1.5),
        axis.ticks = element_blank()) +
  labs(x = 'Age', y = 'Functional Segregation',
       title = 'Movie: System Segregation and Age') +
  theme(plot.title = element_text(hjust = 0.5))
p

# Movie
# Linear model: movie-watching system segregation predicted by linear +
# quadratic age terms, the -1/+1 sex contrast, and age-by-sex interactions
lm_model_Movie <- lm(Mov_SyS ~ agepoly_1z + agepoly_2z + SexC + agepoly_1z:SexC + agepoly_2z:SexC,
                 data = df_Schaefer); 

summary(lm_model_Movie) 
## 
## Call:
## lm(formula = Mov_SyS ~ agepoly_1z + agepoly_2z + SexC + agepoly_1z:SexC + 
##     agepoly_2z:SexC, data = df_Schaefer)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.31635 -0.02377  0.00645  0.02964  0.12212 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.6163288  0.0019974 308.564  < 2e-16 ***
## agepoly_1z      -0.0454595  0.0019999 -22.731  < 2e-16 ***
## agepoly_2z      -0.0108376  0.0019993  -5.421  8.5e-08 ***
## SexC             0.0015817  0.0019974   0.792    0.429    
## agepoly_1z:SexC -0.0003821  0.0019999  -0.191    0.849    
## agepoly_2z:SexC -0.0008270  0.0019993  -0.414    0.679    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04999 on 621 degrees of freedom
## Multiple R-squared:  0.4689, Adjusted R-squared:  0.4646 
## F-statistic: 109.7 on 5 and 621 DF,  p-value: < 2.2e-16
library(heplots)
# Partial eta-squared effect size for each term in the model
etasq(lm_model_Movie,anova=FALSE,partial = TRUE)
##                 Partial eta^2
## agepoly_1z       4.542285e-01
## agepoly_2z       4.527170e-02
## SexC             1.009443e-03
## agepoly_1z:SexC  5.877863e-05
## agepoly_2z:SexC  2.754844e-04
## Residuals                  NA
# Joint F-test that BOTH age terms (linear and quadratic) are zero
car::linearHypothesis(lm_model_Movie,c('agepoly_1z=0','agepoly_2z=0'))
## 
## Linear hypothesis test:
## agepoly_1z = 0
## agepoly_2z = 0
## 
## Model 1: restricted model
## Model 2: Mov_SyS ~ agepoly_1z + agepoly_2z + SexC + agepoly_1z:SexC + 
##     agepoly_2z:SexC
## 
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    623 2.9175                                  
## 2    621 1.5521  2    1.3654 273.16 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# MOVIE
# Fluid intelligence (Cattell) predicted by standardised segregation, the two
# age polynomial terms, sex, and all of their two- and three-way interactions
lm_model_M <- lm(CattellPCA ~ scale(Mov_SyS)*agepoly_1z*SexC + scale(Mov_SyS)*agepoly_2z*SexC,
                 data = df_Schaefer); 

summary(lm_model_M)
## 
## Call:
## lm(formula = CattellPCA ~ scale(Mov_SyS) * agepoly_1z * SexC + 
##     scale(Mov_SyS) * agepoly_2z * SexC, data = df_Schaefer)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.0098 -1.5922  0.1145  1.7964  6.0411 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    16.04076    0.14161 113.277  < 2e-16 ***
## scale(Mov_SyS)                  0.60170    0.15441   3.897 0.000108 ***
## agepoly_1z                     -1.93255    0.15815 -12.219  < 2e-16 ***
## SexC                           -0.27822    0.14161  -1.965 0.049893 *  
## agepoly_2z                     -0.50527    0.13203  -3.827 0.000143 ***
## scale(Mov_SyS):agepoly_1z      -0.13875    0.16115  -0.861 0.389574    
## scale(Mov_SyS):SexC            -0.18044    0.15441  -1.169 0.243013    
## agepoly_1z:SexC                -0.10541    0.15815  -0.667 0.505335    
## scale(Mov_SyS):agepoly_2z      -0.10384    0.11965  -0.868 0.385799    
## SexC:agepoly_2z                -0.01222    0.13203  -0.093 0.926300    
## scale(Mov_SyS):agepoly_1z:SexC -0.03240    0.16115  -0.201 0.840714    
## scale(Mov_SyS):SexC:agepoly_2z -0.04806    0.11965  -0.402 0.688073    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.45 on 615 degrees of freedom
## Multiple R-squared:  0.4859, Adjusted R-squared:  0.4767 
## F-statistic: 52.84 on 11 and 615 DF,  p-value: < 2.2e-16
# Partial eta-squared for each term of the Cattell model
etasq(lm_model_M)
##                                Partial eta^2
## scale(Mov_SyS)                  2.541466e-02
## agepoly_1z                      2.281567e-01
## SexC                            1.040679e-02
## agepoly_2z                      2.746409e-02
## scale(Mov_SyS):agepoly_1z       1.160089e-03
## scale(Mov_SyS):SexC             3.545192e-03
## agepoly_1z:SexC                 1.034644e-03
## scale(Mov_SyS):agepoly_2z       1.338773e-03
## SexC:agepoly_2z                 6.150349e-05
## scale(Mov_SyS):agepoly_1z:SexC  6.573073e-05
## scale(Mov_SyS):SexC:agepoly_2z  2.622582e-04
## Residuals                                 NA
library(visreg)
# Partial-regression plot for each predictor, holding the other predictors
# at reference values (the warnings below are expected with interactions)
visreg(lm_model_M)
## Warning:   Note that you are attempting to plot a 'main effect' in a model that contains an
##   interaction.  This is potentially misleading; you may wish to consider using the 'by'
##   argument.
## Conditions used in construction of plot
## agepoly_1z: 0.0322679
## SexC: -1
## agepoly_2z: -0.3346304

## Warning:   Note that you are attempting to plot a 'main effect' in a model that contains an
##   interaction.  This is potentially misleading; you may wish to consider using the 'by'
##   argument.
## Conditions used in construction of plot
## Mov_SyS: 0.6346401
## SexC: -1
## agepoly_2z: -0.3346304

## Warning:   Note that you are attempting to plot a 'main effect' in a model that contains an
##   interaction.  This is potentially misleading; you may wish to consider using the 'by'
##   argument.
## Conditions used in construction of plot
## Mov_SyS: 0.6346401
## agepoly_1z: 0.0322679
## agepoly_2z: -0.3346304

## Warning:   Note that you are attempting to plot a 'main effect' in a model that contains an
##   interaction.  This is potentially misleading; you may wish to consider using the 'by'
##   argument.
## Conditions used in construction of plot
## Mov_SyS: 0.6346401
## agepoly_1z: 0.0322679
## SexC: -1

OK, what if we want a mixed effect model? You can watch this series for a bit of theory - https://www.youtube.com/watch?v=5D5zEA29tu4&list=PLB2iAtgpI4YEAUiEQ1ZnfMXY-yewNzn9z&index=3&t=110s. You can think of them as fancy regressions, but they can be a bit involved, especially when dealing with non-normal data. Multiple free courses are available online. The main package for this is lme4.

library(lme4)
# sleepstudy: reaction times over days of sleep deprivation, 18 subjects
head(sleepstudy)
##   Reaction Days Subject
## 1 249.5600    0     308
## 2 258.7047    1     308
## 3 250.8006    2     308
## 4 321.4398    3     308
## 5 356.8519    4     308
## 6 414.6901    5     308
library(lattice)
# often a nice quick way to see multi-level data
# 1. Simple scatterplot matrix with conditioning (one panel per subject)
xyplot(Reaction ~ Days | Subject, data = sleepstudy,
       layout = c(4, 5),
       type = c("p", "r"),   # points + regression line
       main = "Reaction Time by Days for Each Subject")

# Small-multiples view: one facet per subject, points only
ggplot(sleepstudy) +
  aes(x = Days, y = Reaction, group = Subject, colour = Subject) +
  geom_point() +
  scale_x_continuous(limits = c(0, 10), breaks = c(0, 10)) +
  facet_wrap(~ Subject, ncol = 9) +
  theme_minimal()

# 1. Basic scatter plot of Reaction vs. Days, coloured by Subject
ggplot(sleepstudy) +
  aes(x = Days, y = Reaction, color = Subject) +
  geom_point(show.legend = FALSE) +
  labs(title = "Reaction Time by Days of Sleep Deprivation",
       y = "Reaction Time (ms)") +
  theme_minimal(base_size = 15)

# 2. Add individual linear trends (random slopes visualization!)
ggplot(sleepstudy,
       aes(x = Days, y = Reaction, group = Subject, color = Subject)) +
  geom_point(show.legend = FALSE) +
  geom_line() +
  labs(title = "Individual Reaction Time Trajectories",
       y = "Reaction Time (ms)") +
  theme_minimal(base_size = 15)

# 3. Overall linear trend (fixed effect only) + individual lines
ggplot(sleepstudy, aes(x = Days, y = Reaction)) +
  geom_point(aes(color = Subject)) +
  geom_line(aes(group = Subject, color = Subject), alpha = 0.5) +
  # `linewidth` replaces the deprecated `size` aesthetic for lines (ggplot2 >= 3.4)
  geom_smooth(method = "lm", se = FALSE, color = "black", linewidth = 1.2) +
  theme_minimal(base_size = 15) +
  labs(title = "Overall Trend and Individual Slopes",
       y = "Reaction Time (ms)")
## `geom_smooth()` using formula = 'y ~ x'

# Per-subject regression lines (dashed) plus one overall black trend line
ggplot(sleepstudy, aes(x = Days, y = Reaction, color = Subject)) +
  geom_point(alpha = 0.5, show.legend = FALSE) +
  # Smooth line **per Subject**
  # `linewidth` replaces the deprecated `size` aesthetic for lines (ggplot2 >= 3.4)
  geom_smooth(method = "lm", se = FALSE, linewidth = 0.7, linetype = "dashed", show.legend = FALSE) +
  # Overall smooth line (no grouping)
  geom_smooth(aes(group = 1), method = "lm", se = FALSE, color = "black", linewidth = 1.2) +
  theme_minimal(base_size = 15) +
  labs(title = "Per-Subject and Overall Trends",
       y = "Reaction Time (ms)")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

example from - https://cdsbasel.github.io/dataanalytics_rsessions/_sessions/CausalInference/intro_lme4.html

# Baseline: intercept-only model — one grand mean of Reaction, ignoring subjects
model1 <- lm(Reaction ~ 1, sleepstudy) # intercept model
summary(model1)
## 
## Call:
## lm(formula = Reaction ~ 1, data = sleepstudy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -104.176  -43.132   -9.857   38.244  167.846 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  298.508      4.198    71.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 56.33 on 179 degrees of freedom
# Fitted values of the intercept-only model: the same flat line (the grand
# mean) in every subject's panel. `<-` is the conventional assignment operator.
datafit <- fitted(model1)

ggplot(sleepstudy, aes(Days, Reaction, group = Subject, colour = Subject)) +
  geom_point() +
  geom_line(aes(y = datafit), linetype = 2) +  # dashed line = model prediction
  facet_wrap(~Subject, ncol = 9) +
  scale_x_continuous(limits = c(0, 10), breaks = c(0, 10)) +
  theme_minimal()

# Random-intercept model: each subject gets their own baseline Reaction level
model2 <- lmer(Reaction ~ 1 + (1 | Subject), sleepstudy)
summary(model2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: Reaction ~ 1 + (1 | Subject)
##    Data: sleepstudy
## 
## REML criterion at convergence: 1904.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.4983 -0.5501 -0.1476  0.5123  3.3446 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  Subject  (Intercept) 1278     35.75   
##  Residual             1959     44.26   
## Number of obs: 180, groups:  Subject, 18
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   298.51       9.05   32.98
# Random-intercept fit: each subject's predicted line is flat, but shifted
# to that subject's baseline. `<-` is the conventional assignment operator.
datafit <- fitted(model2)
ggplot(sleepstudy, aes(Days, Reaction, group = Subject, colour = Subject)) +
  geom_point() +
  geom_line(aes(y = datafit), linetype = 2) +  # dashed line = model prediction
  facet_wrap(~Subject, ncol = 9) +
  scale_x_continuous(limits = c(0, 10), breaks = c(0, 10)) +
  theme_minimal()

# Random intercept + fixed slope of Days: subjects share one slope but
# keep their own baseline
model3 <- lmer(Reaction ~ 1 + Days + (1 | Subject), sleepstudy)
summary(model3)
## Linear mixed model fit by REML ['lmerMod']
## Formula: Reaction ~ 1 + Days + (1 | Subject)
##    Data: sleepstudy
## 
## REML criterion at convergence: 1786.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.2257 -0.5529  0.0109  0.5188  4.2506 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  Subject  (Intercept) 1378.2   37.12   
##  Residual              960.5   30.99   
## Number of obs: 180, groups:  Subject, 18
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 251.4051     9.7467   25.79
## Days         10.4673     0.8042   13.02
## 
## Correlation of Fixed Effects:
##      (Intr)
## Days -0.371
# Fixed-slope fit: parallel lines — same slope for everyone, subject-specific
# intercepts. `<-` is the conventional assignment operator.
datafit <- fitted(model3)
ggplot(sleepstudy, aes(Days, Reaction, group = Subject, colour = Subject)) +
  geom_point() +
  geom_line(aes(y = datafit), linetype = 2) +  # dashed line = model prediction
  facet_wrap(~Subject, ncol = 9) +
  scale_x_continuous(limits = c(0, 10), breaks = c(0, 10)) +
  theme_minimal()

# Random intercept AND random slope: each subject gets their own baseline
# and their own effect of Days (with an estimated intercept-slope correlation)
model4 <- lmer(Reaction ~ 1 + Days + (1 + Days | Subject), sleepstudy)
summary(model4)
## Linear mixed model fit by REML ['lmerMod']
## Formula: Reaction ~ 1 + Days + (1 + Days | Subject)
##    Data: sleepstudy
## 
## REML criterion at convergence: 1743.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9536 -0.4634  0.0231  0.4634  5.1793 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr
##  Subject  (Intercept) 612.10   24.741       
##           Days         35.07    5.922   0.07
##  Residual             654.94   25.592       
## Number of obs: 180, groups:  Subject, 18
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  251.405      6.825  36.838
## Days          10.467      1.546   6.771
## 
## Correlation of Fixed Effects:
##      (Intr)
## Days -0.138
# Random-slope fit: each subject's dashed line now has its own intercept AND
# slope. `<-` is the conventional assignment operator.
datafit <- fitted(model4)
ggplot(sleepstudy, aes(Days, Reaction, group = Subject, colour = Subject)) +
  geom_point() +
  geom_line(aes(y = datafit), linetype = 2) +  # dashed line = model prediction
  facet_wrap(~Subject, ncol = 9) +
  scale_x_continuous(limits = c(0, 10), breaks = c(0, 10)) +
  theme_minimal()

# Same model as model4 written in shorthand: `(Days | Subject)` implies the
# random intercept, i.e. it equals `(1 + Days | Subject)` — compare summaries
mlm_model_rand_slope <- lmer(Reaction ~ Days + (Days| Subject), data =sleepstudy)
summary(mlm_model_rand_slope)
## Linear mixed model fit by REML ['lmerMod']
## Formula: Reaction ~ Days + (Days | Subject)
##    Data: sleepstudy
## 
## REML criterion at convergence: 1743.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9536 -0.4634  0.0231  0.4634  5.1793 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr
##  Subject  (Intercept) 612.10   24.741       
##           Days         35.07    5.922   0.07
##  Residual             654.94   25.592       
## Number of obs: 180, groups:  Subject, 18
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  251.405      6.825  36.838
## Days          10.467      1.546   6.771
## 
## Correlation of Fixed Effects:
##      (Intr)
## Days -0.138
library(ggeffects)


# Population-level (fixed-effect) prediction of Reaction across Days,
# plotted with its confidence band
PRED <- ggpredict(mlm_model_rand_slope, terms=c('Days'))
plot(PRED)

#fixef(mlm_model_rand_slope)  # fixed-effect coefficients
#ranef(mlm_model_rand_slope)  # subject-level deviations from the fixed effects

What if we just did a separate regression for each subject?

# Fit separate models for each Subject
# NOTE(review): dplyr::do() is superseded (group_modify()/nest()+map() are the
# modern equivalents); kept as-is to preserve the rowwise output shown below
separate_models <- sleepstudy %>%
  group_by(Subject) %>%
  do(model = lm(Reaction ~ Days, data = .))

# Extract the slope estimates for each subject
separate_slopes <- separate_models %>%
  mutate(
    intercept = coef(model)[1],
    slope = coef(model)[2]
  ) %>%
  select(Subject, intercept, slope)

print(separate_slopes)
## # A tibble: 18 × 3
## # Rowwise: 
##    Subject intercept slope
##    <fct>       <dbl> <dbl>
##  1 308          244. 21.8 
##  2 309          205.  2.26
##  3 310          203.  6.11
##  4 330          290.  3.01
##  5 331          286.  5.27
##  6 332          264.  9.57
##  7 333          275.  9.14
##  8 334          240. 12.3 
##  9 335          263. -2.88
## 10 337          290. 19.0 
## 11 349          215. 13.5 
## 12 350          226. 19.5 
## 13 351          261.  6.43
## 14 352          276. 13.6 
## 15 369          255. 11.3 
## 16 370          210. 18.1 
## 17 371          254.  9.19
## 18 372          267. 11.3
# Fixed effect slope: the population-average effect of Days
fixed_slope <- fixef(mlm_model_rand_slope)["Days"]

# Random slopes from lmer: each subject's deviation from the fixed slope,
# plus the total (fixed + deviation) slope, built in one pipeline
ranef_slopes <- ranef(mlm_model_rand_slope)$Subject %>%
  tibble::rownames_to_column("Subject") %>%
  select(Subject, slope = Days) %>%
  mutate(total_slope = fixed_slope + slope)

print(ranef_slopes)
##    Subject       slope total_slope
## 1      308   9.1989758  19.6662617
## 2      309  -8.6196806   1.8476053
## 3      310  -5.4488565   5.0184295
## 4      330  -4.8143503   5.6529356
## 5      331  -3.0699116   7.3973743
## 6      332  -0.2721770  10.1951090
## 7      333  -0.2236361  10.2436499
## 8      334   1.0745816  11.5418676
## 9      335 -10.7521652  -0.2848792
## 10     337   8.6282652  19.0955511
## 11     349   1.1734322  11.6407181
## 12     350   6.6142178  17.0815038
## 13     351  -3.0152621   7.4520239
## 14     352   3.5360011  14.0032871
## 15     369   0.8722149  11.3395008
## 16     370   4.8224850  15.2897709
## 17     371  -0.9881562   9.4791297
## 18     372   1.2840221  11.7513080
# Side-by-side comparison: per-subject OLS slopes (slope.x) vs the shrunken
# lmer slope deviations (slope.y) and totals
comparison <- left_join(separate_slopes, ranef_slopes, by = "Subject")

print(comparison)
## # A tibble: 18 × 5
## # Rowwise: 
##    Subject intercept slope.x slope.y total_slope
##    <chr>       <dbl>   <dbl>   <dbl>       <dbl>
##  1 308          244.   21.8    9.20       19.7  
##  2 309          205.    2.26  -8.62        1.85 
##  3 310          203.    6.11  -5.45        5.02 
##  4 330          290.    3.01  -4.81        5.65 
##  5 331          286.    5.27  -3.07        7.40 
##  6 332          264.    9.57  -0.272      10.2  
##  7 333          275.    9.14  -0.224      10.2  
##  8 334          240.   12.3    1.07       11.5  
##  9 335          263.   -2.88 -10.8        -0.285
## 10 337          290.   19.0    8.63       19.1  
## 11 349          215.   13.5    1.17       11.6  
## 12 350          226.   19.5    6.61       17.1  
## 13 351          261.    6.43  -3.02        7.45 
## 14 352          276.   13.6    3.54       14.0  
## 15 369          255.   11.3    0.872      11.3  
## 16 370          210.   18.1    4.82       15.3  
## 17 371          254.    9.19  -0.988       9.48 
## 18 372          267.   11.3    1.28       11.8

For mixed effect models, https://www.bristol.ac.uk/cmm/learning/online-course/ is a great course. Another great resource - https://stats.oarc.ucla.edu/other/mult-pkg/introduction-to-linear-mixed-models/. A more detailed course - https://m-clark.github.io/mixed-models-with-R/random_intercepts.html

Practical example - https://osf.io/s3d5z/files/osfstorage see Paper_Analyses.html linked to this paper doi:10.1037/xge0001462

Disclaimer: the fact that ggplot relies on a specific data structure (a dataframe) can make it a bit less flexible when you want to layer multiple columns on top of each other - e.g., like `hold on` in Matlab. See https://www.geeksforgeeks.org/draw-multiple-overlaid-histograms-with-ggplot2-package-in-r/

# Two histograms on one panel: each layer supplies its own data/aes mapping
ggplot() +
  geom_histogram(data = iris, aes(Petal.Length), fill = "red", alpha = 0.5) +
  geom_histogram(data = iris, aes(Sepal.Length), fill = "skyblue", alpha = 0.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Determine the range for the x-axis
# x_min <- min(iris$Petal.Length, iris$Sepal.Length)
# x_max <- max(iris$Petal.Length, iris$Sepal.Length)
# 
# # Create the first histogram for Petal.Length
# hist(iris$Petal.Length,
#      breaks = 20,
#      col = rgb(1, 0, 0, 0.5),  # Semi-transparent red
#      xlim = c(x_min, x_max),
#      xlab = "Length (cm)",
#      ylab = "Frequency",
#      main = "Overlay of Petal and Sepal Lengths")
# 
# # Overlay the second histogram for Sepal.Length
# hist(iris$Sepal.Length,
#      breaks = 20,
#      col = rgb(0, 0, 1, 0.5),  # Semi-transparent blue
#      add = TRUE)
# 
# # Add a legend to distinguish the histograms
# legend("topright",
#        legend = c("Petal Length", "Sepal Length"),
#        fill = c(rgb(1, 0, 0, 0.5), rgb(0, 0, 1, 0.5)))

It is common to create some functions that wrap around options for plots one wants - this one is taken from stackoverflow

# updated for newer versions of ggplot -
# plot_histogram <- function(df, feature) {
#     plt <- ggplot(df, aes(x=eval(parse(text=feature)))) +
#     geom_histogram(aes(y = ..density..), alpha=0.7, fill="#33AADE", color="black") +
#     geom_density(alpha=0.3, fill="red") +
#     geom_vline(aes(xintercept=mean(eval(parse(text=feature)))), color="black", linetype="dashed", size=1) +
#     labs(x=feature, y = "Density")
#     print(plt)
# }

## eval(parse(text="2+2")) / # eval(2+2) - this is a trick with which we can evaluate the feature argument as a column name
## {{feature }} - is the tidy version of evaluation

# Histogram + density of a single (unquoted) column of `df`, with a dashed
# vertical line at the column's overall mean. `feature` is embraced with
# {{ }} (tidy evaluation), so callers pass a bare column name.
plot_histogram <- function(df, feature) {
  ggplot(df, aes(x = {{ feature }})) +
    geom_histogram(aes(y = after_stat(density)), alpha = 0.7, fill = "#33AADE", color = "black") +
    geom_density(alpha = 0.3, fill = "red") +
    # `linewidth` replaces the deprecated `size` argument for line geoms (ggplot2 >= 3.4)
    geom_vline(aes(xintercept = mean({{ feature }})), color = "black", linetype = "dashed", linewidth = 1) +
    labs(
      title = paste("Histogram with Density for", rlang::as_name(rlang::ensym(feature))),
      x = rlang::as_name(rlang::ensym(feature)),
      y = "Density"
    ) +
    theme_minimal()
}


# Overlaid histograms + densities of `feature`, one fill colour per level of
# `label_column`. The single dashed line marks the GRAND mean of `feature`
# (not per-group means — see plot_multi_histogram_with_group_means below).
plot_multi_histogram <- function(df, feature, label_column) {
  ggplot(df, aes(x = {{ feature }}, fill = {{ label_column }})) +
    geom_histogram(
      aes(y = after_stat(density)),
      position = "identity",
      alpha = 0.6,
      color = "black",
      bins = 30
    ) +
    geom_density(alpha = 0.4) +
    # `linewidth` replaces the deprecated `size` argument for line geoms (ggplot2 >= 3.4)
    geom_vline(
      aes(xintercept = mean({{ feature }})),
      color = "black",
      linetype = "dashed",
      linewidth = 1
    ) +
    labs(
      title = paste("Histogram of", rlang::as_name(rlang::ensym(feature)), "by", rlang::as_name(rlang::ensym(label_column))),
      x = rlang::as_name(rlang::ensym(feature)),
      y = "Density",
      fill = rlang::as_name(rlang::ensym(label_column))
    ) +
    theme_minimal()
}
plot_histogram(iris, Petal.Length)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Like plot_multi_histogram(), but the dashed vertical lines mark each
# group's OWN mean. Column arguments are captured as symbols (ensym), so
# callers pass bare column names.
plot_multi_histogram_with_group_means <- function(df, value_col, group_col) {
  value_col <- rlang::ensym(value_col)
  group_col <- rlang::ensym(group_col)

  # Step 1: Compute group-wise means
  group_means <- df %>%
    group_by(!!group_col) %>%
    summarize(mean_value = mean(!!value_col), .groups = "drop")

  # Step 2: Plot histogram + group-wise mean lines
  ggplot(df, aes(x = !!value_col, fill = !!group_col)) +
    geom_histogram(aes(y = after_stat(density)), alpha = 0.6, position = "identity", color = "black", bins = 30) +
    geom_density(alpha = 0.3) +
    # geom_vline() never inherits the plot aes and does not accept
    # `inherit.aes` (it was triggering an "Ignoring unknown parameters"
    # warning), so that argument is dropped; `linewidth` replaces the
    # deprecated `size` argument for line geoms (ggplot2 >= 3.4)
    geom_vline(
      data = group_means,
      aes(xintercept = mean_value, color = !!group_col),
      linetype = "dashed",
      linewidth = 1,
      show.legend = FALSE
    ) +
    labs(
      title = paste("Histogram of", rlang::as_name(value_col), "by", rlang::as_name(group_col)),
      x = rlang::as_name(value_col),
      y = "Density",
      fill = rlang::as_name(group_col)
      #color = paste("Mean of", rlang::as_name(group_col))
    ) +
    theme_minimal()
}


# Simulate six normal samples (n = 1000 each) with means 1-6, labelled A-F
# NOTE(review): no set.seed(), so the draws differ between runs; also note
# that `c` here masks base::c in the global environment — consider renaming
a <- data.frame(n = rnorm(1000, mean = 1), category = "A")
b <- data.frame(n = rnorm(1000, mean = 2), category = "B")
c <- data.frame(n = rnorm(1000, mean = 3), category = "C")
d <- data.frame(n = rnorm(1000, mean = 4), category = "D")
e <- data.frame(n = rnorm(1000, mean = 5), category = "E")
f <- data.frame(n = rnorm(1000, mean = 6), category = "F")

# Stack the six data frames into one long data frame for plotting
many_distros <- bind_rows(a, b, c, d, e, f)

plot_multi_histogram_with_group_means(many_distros, n, category)
## Warning in geom_vline(data = group_means, aes(xintercept = mean_value, color = !!group_col), : Ignoring unknown parameters:
## `inherit.aes`